From af675e5c3c2eb715aa8a161976ae9cc2c649db26 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 15:56:40 +0200 Subject: [PATCH 1/3] Add codes to reasons for filtering out an HPO term in MTC filter. Do not show untested HPO terms in the summary frame. Fixes #228 --- docs/report/tbx5_frameshift_vs_missense.csv | 26 +- ...bx5_frameshift_vs_missense.mtc_report.html | 77 ++---- docs/tutorial.rst | 10 +- docs/user-guide/mtc.rst | 126 +++++----- docs/user-guide/report/tbx5_frameshift.csv | 28 +-- .../report/tbx5_frameshift.mtc_report.html | 81 ++---- docs/user-guide/stats.rst | 9 +- src/gpsea/analysis/mtc_filter/_impl.py | 238 +++++++++++++----- src/gpsea/view/_phenotype_analysis.py | 8 +- src/gpsea/view/_stats.py | 17 +- src/gpsea/view/templates/stats.html | 5 +- tests/analysis/test_mtc_filter.py | 1 - tests/view/test_stats.py | 2 +- 13 files changed, 338 insertions(+), 290 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 772f062f9..76666c2a0 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,22 +1,18 @@ Genotype group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 -Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 -Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 -Heart block [HP:0012722],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 -Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.0191369345329502,0.005628510156750059 -Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.038236617183985605,0.01349527665317139 -Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.062175372424826694,0.02560162393963452 -Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.15811357916621074,0.07440639019586388 -Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2690764879148444,0.1424522583078588 -Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2868675985983051,0.1687456462342971 -Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 -Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 +Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 +Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 +Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.01932367149758454,0.0036231884057971015 +Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.022514040627000236,0.005628510156750059 +Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.04318488529014845,0.01349527665317139 +Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.06827099717235872,0.02560162393963452 +Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.17007174901911745,0.07440639019586388 +Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2849045166157176,0.1424522583078588 +Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.29999225997208373,0.1687456462342971 +Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6857142857142857,0.42857142857142855 +Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.831168831168831,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, -Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, -Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html index b6ac17472..14f949c39 100644 --- a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html +++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 17 out of the total of 260 HPO terms.

+

Performed statistical tests for 16 out of the total of 260 HPO terms.

- + @@ -59,80 +59,45 @@

Phenotype testing report

- - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + diff --git a/docs/tutorial.rst b/docs/tutorial.rst index dcecee150..8917ce46a 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -246,15 +246,15 @@ Now we can perform the analysis and investigate the results. ... pheno_predicates=pheno_predicates, ... ) >>> result.total_tests -17 +16 -We only tested 1y HPO terms. This is despite the individuals being collectively annotated with +We only tested 16 HPO terms. This is despite the individuals being collectively annotated with 260 direct and indirect HPO terms >>> len(result.phenotypes) 260 -We can show the reasoning behind *not* testing 243 (`260 - 17`) HPO terms +We can show the reasoning behind *not* testing 244 (`260 - 16`) HPO terms by exploring the phenotype MTC filtering report. >>> from gpsea.view import MtcStatsViewer @@ -266,11 +266,11 @@ by exploring the phenotype MTC filtering report. .. raw:: html :file: report/tbx5_frameshift_vs_missense.mtc_report.html -and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: +and these are the tested HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: >>> from gpsea.view import summarize_hpo_analysis >>> summary_df = summarize_hpo_analysis(hpo, result) ->>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP +>>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense :file: report/tbx5_frameshift_vs_missense.csv diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index 070539d19..b01c95d1f 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -194,60 +194,74 @@ The constructor takes a threshold as an argument (e.g. 20% in the example above) and the method's logic is made up of 8 individual heuristics designed to skip testing the HPO terms that are unlikely to yield significant or interesting results: -#. Skip terms that occur very rarely - The ``term_frequency_threshold`` determines the mininum proportion of individuals - with direct or indirect annotation by the HPO term to test. - We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), and we only retain a term for testing - if the proportion of individuals in at least one genotype group is greater than or equal to ``term_frequency_threshold``. - - This is because of our assumption that even if there is statistical significance, - if a term is only seen in (for example) 7% of individuals in the MISSENSE group and 2% in the not-MISSENSE group, - the term is unlikely to be of great interest because it is rare. - -#. Skip terms if no cell has more than one count - In a related heuristic, we skip terms if no genotype group has more than one count. - This is not completely redundant with the previous condition, - because some terms may have a small number of total observations. - -#. Skip terms if all counts are identical to counts for a child term - Let's say a term such as - `Posterior polar cataract (HP:0001115) `_ - was observed in 7 of 11 individuals with MISSENSE variants - and in 3 of 8 individuals with NONSENSE variants. - If we find the same patient counts (7 of 11 and 3 of 8) in the parent term - `Polar cataract HP:0010696 `_, - then we choose to not test the parent term. - - This is because the more specific an HPO term is, - the more information it has (the more interesting the correlation would be if it exists), - and the result of the Fisher Exact test for *Polar cataract* - would be exactly the same as for *Posterior polar cataract*. - -#. Skip terms if genotypes have same HPO proportions - If both (or all) of the genotype groups have the same proportion of individuals - observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, - because it is not possible that the Fisher exact test will return a significant result. - -#. Skip terms if there are no HPO observations in a group - If one of the genotype groups has neither observed nor excluded observations for an HPO term, skip it. - -#. Skipping terms that are not descendents of `Phenotypic abnormality (HP:0000118) `_ - The HPO has a number of other branches that describe modes of inheritance, - past medical history, and clinical modifiers. - We do not think it makes much sense to test for enrichment of these terms, - and so they are filtered out. - -#. Skipping "general" level terms - All the direct children of the root phenotype term - `Phenotypic abnormality (HP:0000118) `_ are skipped, - because of the assumption that if there is a valid signal, - it will derive from one of the more specific descendents. - - For instance, `Abnormality of the nervous system (HP:0000707) `_ - is a child of *Phenotypic abnormality*, and this assumption implies - that if there is a signal from the nervous system, - it will lead to at least one of the descendents of - *Abnormality of the nervous system* being significant. - - See :ref:`general-hpo-terms` section for details. ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| Code | Name | Description | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF01` | Skip terms that | The ``term_frequency_threshold`` determines the mininum proportion of individuals | +| | occur very rarely | with direct or indirect annotation by the HPO term to test. | +| | | We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), | +| | | and we only retain a term for testing if the proportion of individuals | +| | | in at least one genotype group is greater than | +| | | or equal to ``term_frequency_threshold``. | +| | | This is because of our assumption that even if there is statistical significance, | +| | | if a term is only seen in (for example) 7% of individuals | +| | | in the MISSENSE group and 2% in the not-MISSENSE group, | +| | | the term is unlikely to be of great interest because it is rare. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF02` | Skip terms if | In a related heuristic, we skip terms if no genotype group has more | +| | no cell has more | than one count. This is not completely redundant with the previous condition, | +| | than one count | because some terms may have a small number of total observations. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF03` | Skip terms if | Let's say a term such as | +| | all counts are | `Posterior polar cataract (HP:0001115) `_ | +| | identical | was observed in 7 of 11 individuals with MISSENSE variants | +| | to counts | and in 3 of 8 individuals with NONSENSE variants. | +| | for a child | If we find the same patient counts (7 of 11 and 3 of 8) in the parent term | +| | term | `Polar cataract HP:0010696 `_, | +| | | then we choose to not test the parent term. | +| | | | +| | | This is because the more specific an HPO term is, | +| | | the more information it has (the more interesting the correlation would be if it exists), | +| | | and the result of a test, such as the Fisher Exact test, would be exactly the same | +| | | for *Polar cataract* as for *Posterior polar cataract*. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF04` | Skip terms if | If both (or all) of the genotype groups have the same proportion of individuals | +| | genotypes have | observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, | +| | same HPO | because it is not possible that the Fisher exact test will return a significant result. | +| | proportions | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF05` | Skip terms if | If one of the genotype groups has neither observed nor excluded observations | +| | there are no | for an HPO term, skip it. | +| | HPO observations | | +| | in a group | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF06` | Skip term if | If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) | +| | underpowered | and the total count of patients in all genotype-phenotype groups is less than 7, | +| | for 2x2 or 2x3 | or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients | +| | analysis | is less than 6, then there is a lack even of the nominal statistical power | +| | | and the counts can never be significant. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF07` | Skipping terms | The HPO has a number of other branches that describe modes of inheritance, | +| | that are not | past medical history, and clinical modifiers. | +| | descendents of | We do not think it makes much sense to test for enrichment of these terms, | +| | *Phenotypic* | so, all terms that are not descendants of | +| | *abnormality* | `Phenotypic abnormality `_ are filtered out. | +| | | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF08` | Skipping | All the direct children of the root phenotype term | +| | "general" | `Phenotypic abnormality (HP:0000118) `_ | +| | level terms | are skipped, because of the assumption that if there is a valid signal, | +| | | it will derive from one of the more specific descendents. | +| | | | +| | | For instance, | +| | |`Abnormality of the nervous system `_ | +| | | (HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies | +| | | that if there is a signal from the nervous system, | +| | | it will lead to at least one of the descendents of | +| | | *Abnormality of the nervous system* being significant. | +| | | | +| | | See :ref:`general-hpo-terms` section for details. | +| | | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ + diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index 4851c198a..39c76f0bb 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,22 +1,18 @@ What is the genotype group,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 -Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 -Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.021044138590779585,0.003713671516019927 -Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.034,0.01 -Heart block [HP:0012722],1/23,4%,2/2,100%,0.034,0.01 -Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.09214092140921408,0.03252032520325203 -Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.1440020479198931,0.06544319142266644 -Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.1440020479198931,0.06932119159387057 -Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.1440020479198931,0.08259109311740892 -Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.1440020479198931,0.08470708701170182 -Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667 -Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483 +Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.003870827221893892,0.00024192670136836825 +Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01230769230769231,0.0015384615384615387 +Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.01980624808543961,0.003713671516019927 +Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.04,0.01 +Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.10406504065040649,0.03252032520325203 +Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.15059037690969213,0.06544319142266644 +Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.15059037690969213,0.06932119159387057 +Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.15059037690969213,0.08259109311740892 +Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.15059037690969213,0.08470708701170182 +Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.708378140298727,0.4666666666666667 +Short thumb [HP:0009778],25/69,36%,8/30,27%,0.708378140298727,0.48700997145537483 Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444 +Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0 Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0 Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0 -Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 -Aplasia/Hypoplasia of the thumb [HP:0009601],40/40,100%,19/19,100%,, -Aplasia/Hypoplasia of fingers [HP:0006265],44/44,100%,19/19,100%,, -Aplasia/hypoplasia involving bones of the hand [HP:0005927],44/44,100%,19/19,100%,, diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html index 3d00300a0..242057f92 100644 --- a/docs/user-guide/report/tbx5_frameshift.mtc_report.html +++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 17 out of the total of 260 HPO terms.

+

Performed statistical tests for 16 out of the total of 260 HPO terms.

Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
Code
TODOHMF01 Skipping term with maximum frequency that was less than threshold 0.2 51
TODOSkipping general term44
TODOSkipping term because all genotypes have same HPO observed proportions41
TODOSkipping term with only 2 observations (not powered for 2x2)26
TODOSkipping term with only 1 observations (not powered for 2x2)24
TODOSkipping term with only 3 observations (not powered for 2x2)22HMF02Skipping term because no genotype has more than one observed HPO count3
TODOSkipping term with only 4 observations (not powered for 2x2)14HMF03Skipping term because of a child term with the same individual counts1
TODOSkipping term with only 6 observations (not powered for 2x2)12HMF04Skipping term because all genotypes have same HPO observed proportions41
TODOSkipping term with only 5 observations (not powered for 2x2)4HMF05Skipping term because one genotype had zero observations2
TODOSkipping term because no genotype has more than one observed HPO count3HMF06Skipping term with less than 7 observations (not powered for 2x2)102
TODOSkipping term because one genotype had zero observations2HMF08Skipping general term44
- + @@ -59,80 +59,45 @@

Phenotype testing report

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 5132b4b51..991fbd346 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -230,10 +230,10 @@ We can now execute the analysis: >>> len(result.phenotypes) 260 >>> result.total_tests -17 +16 -Thanks to Phenotype MTC filter, we only tested 17 out of 260 terms. +Thanks to Phenotype MTC filter, we only tested 16 out of 260 terms. We can learn more by showing the MTC filter report: >>> from gpsea.view import MtcStatsViewer @@ -251,12 +251,11 @@ Genotype phenotype associations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Last, let's explore the associations. The results include a table with all tested HPO terms -ordered by the corrected p value (Benjamini-Hochberg FDR). -Here we show the top 20 table rows: +ordered by the corrected p value (Benjamini-Hochberg FDR): >>> from gpsea.view import summarize_hpo_analysis >>> summary_df = summarize_hpo_analysis(hpo, result) ->>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP +>>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest :file: report/tbx5_frameshift.csv diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index c52d10b9c..332a53a21 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -1,10 +1,10 @@ import abc +import dataclasses import typing from collections import deque import hpotk -from matplotlib import axis import numpy as np import pandas as pd @@ -12,51 +12,83 @@ from ..predicate.phenotype import PhenotypePolyPredicate, P +@dataclasses.dataclass(eq=True, frozen=True) +class PhenotypeMtcIssue: + """ + The container for data available regarding the reason why a phenotype was filtered out. + """ + + code: str + """ + A `str` with a unique code of the issue. + """ + + reason: str + """ + A human-friendly explanation of the issue. + """ + + class PhenotypeMtcResult: """ `PhenotypeMtcResult` represents a result of :class:`PhenotypeMtcFilter` for a single phenotype. The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`) - of be filtered out (:meth:`is_filtered_out`) in which case :attr:`reason` must be available. + of be filtered out (:meth:`is_filtered_out`) in which case :attr:`mtc_issue` with more context + regarding culprit must be available. """ @staticmethod def ok() -> "PhenotypeMtcResult": # A singleton would be nice here... - return PhenotypeMtcResult(status=True, reason=None) + return PhenotypeMtcResult(status=True, issue=None) @staticmethod - def fail(reason: str) -> "PhenotypeMtcResult": - return PhenotypeMtcResult(status=False, reason=reason) + def fail(code: str, reason: str) -> "PhenotypeMtcResult": + issue = PhenotypeMtcIssue(code=code, reason=reason) + return PhenotypeMtcResult(status=False, issue=issue) def __init__( self, status: bool, - reason: typing.Optional[str], + issue: typing.Optional[PhenotypeMtcIssue], ): + assert isinstance(status, bool) + if status: + assert issue is None, '`issue` must NOT be provided if the HPO term passed the MTC filter' + else: + assert issue is not None, '`issue` must be provided if the HPO term failed the MTC filter' + self._status = status - self._reason = reason + self._issue = issue def is_passed(self) -> bool: return self._status def is_filtered_out(self) -> bool: return not self._status - + + @property + def mtc_issue(self) -> typing.Optional[PhenotypeMtcIssue]: + return self._issue + @property def reason(self) -> typing.Optional[str]: - return self._reason + if self.mtc_issue is None: + return None + else: + return self.mtc_issue.reason def __eq__(self, value: object) -> bool: return isinstance(value, PhenotypeMtcResult) \ and self._status == value._status \ - and self._reason == value._reason + and self._issue == value._issue def __hash__(self) -> int: - return hash((self._status, self._reason)) + return hash((self._status, self._issue)) def __str__(self) -> str: - return f'PhenotypeMtcResult(status={self._status}, reason={self._reason})' + return f'PhenotypeMtcResult(status={self._status}, issue={self._issue})' def __repr__(self) -> str: return str(self) @@ -71,6 +103,14 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta): Therefore, the expected input asks for :class:`hpotk.TermId` items. For instance, `n_usable` is a mapping from an *HPO term* to an `int` with the count of the patients categorized according to the HPO term. + + :attr:`PhenotypeMtcFilter.OK` is returned for HPO terms that pass MTC filtering + and *should* be included in the analysis. + """ + + OK = PhenotypeMtcResult.ok() + """ + The MTC result for the phenotypes that pass the filtering and *should* be included in the analysis. """ @abc.abstractmethod @@ -91,6 +131,13 @@ def filter( """ pass + @abc.abstractmethod + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + """ + Return all possible result types which the `PhenotypeMtcFilter` can produce. + """ + pass + @abc.abstractmethod def filter_method_name(self) -> str: """ @@ -106,9 +153,6 @@ class UseAllTermsMtcFilter(PhenotypeMtcFilter[typing.Any]): See :ref:`use-all-terms-strategy` section for more info. """ - def __init__(self): - self._ok = PhenotypeMtcResult.ok() - def filter( self, gt_predicate: GenotypePolyPredicate, @@ -116,7 +160,10 @@ def filter( counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: # Always OK! 😏 - return tuple(self._ok for _ in ph_predicates) + return tuple(PhenotypeMtcFilter.OK for _ in ph_predicates) + + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return (PhenotypeMtcFilter.OK,) def filter_method_name(self) -> str: return "All HPO terms" @@ -133,6 +180,11 @@ class SpecifiedTermsMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): See :ref:`specify-terms-strategy` section for more info. """ + + NON_SPECIFIED_TERM = PhenotypeMtcResult.fail(code="ST1", reason="Non-specified term") + """ + The MTC filtering result returned when an HPO term does not belong among the selection of terms to be tested. + """ def __init__( self, @@ -142,8 +194,6 @@ def __init__( Args: terms_to_test: an iterable of TermIds representing the terms to test """ - self._ok = PhenotypeMtcResult.ok() - self._fail = PhenotypeMtcResult.fail("Non-specified term") self._terms_to_test_set = set(terms_to_test) def filter( @@ -155,10 +205,16 @@ def filter( results = [] for predicate in ph_predicates: if predicate.phenotype in self._terms_to_test_set: - results.append(self._ok) + results.append(SpecifiedTermsMtcFilter.OK) else: - results.append(self._fail) + results.append(SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM) return tuple(results) + + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return ( + PhenotypeMtcFilter.OK, + SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM, + ) def filter_method_name(self) -> str: return "Specified terms MTC filter" @@ -174,16 +230,21 @@ class HpoMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): We recommend creating an instance using the :func:`default_filter` static factory method. """ - # TODO: this has been here before. Is it still valid? - # 2. Do not perform a test if the counts in the genotype categories do not even have nominal statistical power - # for i in range(2,6): - # for j in range(2,6): - # # create a table with the most extreme possible counts. If this is not significant, then - # # other counts also cannot be - # two_by_two_table = [[i, 0], [0, j]] - # oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') - # This code reveals that (2,4), (4,2), (2,3), (3,3), (2,2) and (3,2) are not powered so we can always skip them - # ## TODO -- similar calculate for 3x2 + NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO = PhenotypeMtcResult.fail( + "HMF02", "Skipping term because no genotype has more than one observed HPO count", + ) + SAME_COUNT_AS_THE_ONLY_CHILD = PhenotypeMtcResult.fail( + "HMF03", "Skipping term because of a child term with the same individual counts", + ) + SKIPPING_SAME_OBSERVED_PROPORTIONS = PhenotypeMtcResult.fail( + "HMF04", + "Skipping term because all genotypes have same HPO observed proportions", + ) + SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS = PhenotypeMtcResult.fail( + "HMF05", "Skipping term because one genotype had zero observations" + ) + SKIPPING_NON_PHENOTYPE_TERM = PhenotypeMtcResult.fail("HMF07", "Skipping non phenotype term") + SKIPPING_GENERAL_TERM = PhenotypeMtcResult.fail("HMF08", "Skipping general term") @staticmethod def default_filter( @@ -262,13 +323,40 @@ def __init__( """ self._hpo = hpo self._hpo_term_frequency_filter = term_frequency_threshold - # The following numbers of total observations in the genotype groups can never be significant, - # so we just skip them see above explanation - self._powerless_set = {(2, 4), (4, 2), (2, 3), (3, 3), (2, 2), (3, 2)} - # thus if the total count is 6 or less, there is no power - CAN WE SIMPLIFY? self._general_hpo_terms = set(general_hpo_terms) - self._ok = PhenotypeMtcResult.ok() + + self._below_frequency_threshold = PhenotypeMtcResult.fail( + code="HMF01", + reason="Skipping term with maximum frequency that was" + f" less than threshold {self._hpo_term_frequency_filter}", + ) + + # Do not perform a test if the counts in the genotype categories do not even have nominal statistical power + # The following numbers of total observations in the genotype groups can never be significant, + # so we just skip them. + # ``` + # for i in range(2,6): + # for j in range(2,6): + # # create a table with the most extreme possible counts. If this is not significant, then + # # other counts also cannot be + # two_by_two_table = [[i, 0], [0, j]] + # oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') + # ``` + # This code reveals that (2,4), (4,2), (2,3), (3,3), (2,2) and (3,2) are not powered so we can always skip them + # Thus if the total count is 6 or less, there is no power. + self._min_observations_for_2_by_2 = 7 + self._not_powered_for_2_by_2 = PhenotypeMtcResult.fail( + code="HMF06", + reason=f"Skipping term with less than {self._min_observations_for_2_by_2} observations" + " (not powered for 2x2)", + ) + self._min_observations_for_2_by_3 = 6 + self._not_powered_for_2_by_3 = PhenotypeMtcResult.fail( + code="HMF06", + reason=f"Skipping term with less than {self._min_observations_for_2_by_3} observations" + " (not powered for 2x3)", + ) def filter( self, @@ -290,46 +378,39 @@ def filter( continue if term_id in self._general_hpo_terms: - results[idx] = PhenotypeMtcResult.fail("Skipping general term") + results[idx] = HpoMtcFilter.SKIPPING_GENERAL_TERM continue if not self._hpo.graph.is_ancestor_of( hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id ): - results[idx] = PhenotypeMtcResult.fail("Skipping non phenotype term") + results[idx] = HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM continue - # get total number of observations - # if term_id not in all_counts: - # reason_for_filtering_out["Skipping non-target term"] += 1 - # continue - ph_predicate = ph_predicates[idx] contingency_matrix = counts[idx] - total = contingency_matrix.sum().sum() + max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( contingency_matrix, ph_predicate=ph_predicate, ) if max_freq < self._hpo_term_frequency_filter: - reason = ( - "Skipping term with maximum frequency " - f"that was less than threshold {self._hpo_term_frequency_filter}" - ) - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = self._below_frequency_threshold continue - if contingency_matrix.shape == (2, 2) and total < 7: - reason = f"Skipping term with only {total} observations (not powered for 2x2)" - results[idx] = PhenotypeMtcResult.fail(reason) + total_count = contingency_matrix.sum().sum() + if contingency_matrix.shape == (2, 2) and total_count < self._min_observations_for_2_by_2: + results[idx] = self._not_powered_for_2_by_2 + continue + elif contingency_matrix.shape == (2, 3) and total_count < self._min_observations_for_2_by_3: + results[idx] = self._not_powered_for_2_by_3 continue if not HpoMtcFilter.some_cell_has_greater_than_one_count( counts=contingency_matrix, ph_predicate=ph_predicate, ): - reason = "Skipping term because no genotype has more than one observed HPO count" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO continue elif HpoMtcFilter.genotypes_have_same_hpo_proportions( @@ -337,29 +418,40 @@ def filter( gt_predicate=gt_predicate, ph_predicate=ph_predicate, ): - reason = "Skipping term because all genotypes have same HPO observed proportions" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.SKIPPING_SAME_OBSERVED_PROPORTIONS continue elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( counts=contingency_matrix, gt_predicate=gt_predicate, ): - reason = "Skipping term because one genotype had zero observations" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS continue - # TODO: the code below actually did not work. - # for child_term_id in self._hpo.graph.get_children(term_id): - # if child_term_id in tested_counts_pf: - # if tested_counts_pf[child_term_id].equals(contingency_matrix): - # # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches? - # reason = "Child term with same counts previously tested" - # reason_for_filtering_out[reason] += 1 - # continue - + # Check if the term has exactly one child with a very similar number of individuals + # in the genotype and phenotype groups. + child_contingency_matrix = None + for child in self._hpo.graph.get_children(term_id): + if child in p_to_idx: + child_idx = p_to_idx[child] + if child_contingency_matrix is None: + child_contingency_matrix = counts[child_idx] + else: + # Found 2nd child of this term. + # We always want to test a term with + # >1 children in the target set. + child_contingency_matrix = None + break + + if child_contingency_matrix is not None: + if (contingency_matrix - child_contingency_matrix).abs().max(axis=None) < 1: + # Do not test if the count is exactly the same to the counts in the only child term. + results[idx] = HpoMtcFilter.SAME_COUNT_AS_THE_ONLY_CHILD + continue + + # ## # The term should be tested if we get here. - results[idx] = self._ok + results[idx] = PhenotypeMtcFilter.OK # There should be no `None` elements in `results` at this point. if any(r is None for r in results): @@ -375,6 +467,18 @@ def filter( # Ignoring the type hint, because we checked the type match above. return tuple(results) # type: ignore + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return ( + PhenotypeMtcFilter.OK, + HpoMtcFilter.SKIPPING_GENERAL_TERM, + HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM, + self._below_frequency_threshold, + self._not_powered_for_2_by_2, + HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO, + HpoMtcFilter.SKIPPING_SAME_OBSERVED_PROPORTIONS, + HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS, + ) + def filter_method_name(self) -> str: return "HPO MTC filter" diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 75b65866d..256c3fb3e 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -11,6 +11,8 @@ def summarize_hpo_analysis( """ Create a dataframe with counts, frequencies, and p values for the tested HPO terms. + The HPO terms that were not tested will *not* be included in the frame. + :param hpo: HPO data. :param result: the HPO term analysis results to show. """ @@ -51,10 +53,12 @@ def summarize_hpo_analysis( # Last, sort by corrected p value or just p value df = df.set_index(labeled_idx) + # and only report the tested HPO terms + with_p_value = df[("", p_val_col_name)].notna() if result.corrected_pvals is not None: - return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]) + return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]).loc[with_p_value] else: - return df.sort_values(by=("", p_val_col_name)) + return df.sort_values(by=("", p_val_col_name)).loc[with_p_value] def format_term_id( diff --git a/src/gpsea/view/_stats.py b/src/gpsea/view/_stats.py index b232ad151..5d9e1ab18 100644 --- a/src/gpsea/view/_stats.py +++ b/src/gpsea/view/_stats.py @@ -43,12 +43,19 @@ def _prepare_context( counts = Counter() for result in report.mtc_filter_results: if result.is_filtered_out(): - counts[result.reason] += 1 + counts[result.mtc_issue] += 1 n_skipped = 0 - reason_to_count = list() - for reason, count in sorted(counts.items(), key=lambda item: item[1], reverse=True): - reason_to_count.append({"reason": reason, "count": count}) + issue_to_count = list() + for mtc_issue, count in sorted( + counts.items(), + key=lambda issue2count: (issue2count[0].code, issue2count[0].reason) + ): + issue_to_count.append({ + "code": mtc_issue.code, + "reason": mtc_issue.reason, + "count": count, + }) n_skipped += count n_all = len(report.phenotypes) @@ -61,5 +68,5 @@ def _prepare_context( "skipped_hpo_count": n_skipped, "tested_hpo_count": n_tested, "total_hpo_count": n_all, - "reason_to_count": reason_to_count, + "issue_to_count": issue_to_count, } diff --git a/src/gpsea/view/templates/stats.html b/src/gpsea/view/templates/stats.html index 2a75c055a..3fd48ed5e 100644 --- a/src/gpsea/view/templates/stats.html +++ b/src/gpsea/view/templates/stats.html @@ -57,10 +57,9 @@

Phenotype testing report

- {% for skipped in reason_to_count %} + {% for skipped in issue_to_count %} - - + diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index 02710efdf..88e085dd6 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -1,4 +1,3 @@ -from itertools import count import typing import hpotk diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index d57b3ae29..f557a3e79 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -49,7 +49,7 @@ def hpo_term_analysis_result( gt_predicate=suox_gt_predicate, mtc_filter_name='Random MTC filter', mtc_filter_results=( - PhenotypeMtcResult.fail("Not too interesting"), + PhenotypeMtcResult.fail("RMF01", "Not too interesting"), PhenotypeMtcResult.ok(), ), mtc_name='fdr_bh', From 43b8ee4e630c7911a8e7f0d81fe11aa3152fed90 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 16:00:46 +0200 Subject: [PATCH 2/3] Export `PhenotypeMtcIssue`. --- src/gpsea/analysis/mtc_filter/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpsea/analysis/mtc_filter/__init__.py b/src/gpsea/analysis/mtc_filter/__init__.py index 4497ac343..57f0d6149 100644 --- a/src/gpsea/analysis/mtc_filter/__init__.py +++ b/src/gpsea/analysis/mtc_filter/__init__.py @@ -5,10 +5,10 @@ See :ref:`MTC filters ` section for more info. """ -from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult +from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult, PhenotypeMtcIssue from ._impl import UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter __all__ = [ - 'PhenotypeMtcFilter', 'PhenotypeMtcResult', + 'PhenotypeMtcFilter', 'PhenotypeMtcResult', 'PhenotypeMtcIssue', 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter', ] From 9c98210f7d5aa8af74ca74473953fb7432fbdb01 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 16:01:46 +0200 Subject: [PATCH 3/3] Improve pydoc. --- src/gpsea/analysis/mtc_filter/_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index 332a53a21..ad691c403 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -35,7 +35,7 @@ class PhenotypeMtcResult: The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`) of be filtered out (:meth:`is_filtered_out`) in which case :attr:`mtc_issue` with more context - regarding culprit must be available. + regarding the culprit must be available. """ @staticmethod
Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
Code
TODOSkipping term with only 2 observations (not powered for 2x2)51
TODOSkipping term because all genotypes have same HPO observed proportions50
TODOSkipping general term44
TODOSkipping term with only 3 observations (not powered for 2x2)27
TODOSkipping term with only 6 observations (not powered for 2x2)19HMF01Skipping term with maximum frequency that was less than threshold 0.210
TODOSkipping term with only 4 observations (not powered for 2x2)16HMF02Skipping term because no genotype has more than one observed HPO count4
TODOSkipping term with maximum frequency that was less than threshold 0.210HMF03Skipping term because of a child term with the same individual counts1
TODOSkipping term with only 5 observations (not powered for 2x2)10HMF04Skipping term because all genotypes have same HPO observed proportions50
TODOSkipping term with only 1 observations (not powered for 2x2)7HMF05Skipping term because one genotype had zero observations5
TODOSkipping term because one genotype had zero observations5HMF06Skipping term with less than 7 observations (not powered for 2x2)130
TODOSkipping term because no genotype has more than one observed HPO count4HMF08Skipping general term44
Reason Count
TODO{{ skipped.code }} {{ skipped.reason }} {{ skipped.count }}