diff --git a/docs/_static/genophenocorr.css b/docs/_static/genophenocorr.css index 829c2d512..00a18c905 100644 --- a/docs/_static/genophenocorr.css +++ b/docs/_static/genophenocorr.css @@ -1,3 +1,3 @@ .wy-nav-content { - max-width: 55% !important; + max-width: 1200px !important; } \ No newline at end of file diff --git a/docs/img/tutorial/tbx5_protein_diagram.png b/docs/img/tutorial/tbx5_protein_diagram.png new file mode 100644 index 000000000..a4b146102 Binary files /dev/null and b/docs/img/tutorial/tbx5_protein_diagram.png differ diff --git a/docs/index.rst b/docs/index.rst index 673d15ae1..b36cb333f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,6 @@ -============= -genophenocorr -============= +===== +GPSEA +===== The concept of phenotype denote the observable attributes of an individual, but in @@ -10,9 +10,9 @@ A key question in biology and human genetics concerns the relationships between genetics, the focus is generally placed on the study of whether specific disease-causing alleles are associated with specific phenotypic manifestations of the disease. -`genophenocorr` is a Python package designed to support genotype-phenotype correlation analysis. -The input to `genophenocorr` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. -`genophenocorr` ingests data from these phenopackets and performs analysis of the correlation of specific variants, +`GPSEA` (genotypes and phenotypes - study and evaluation of associations) is a Python package designed to support genotype-phenotype correlation analysis. +We pronounce GPSEA as "G"-"P"-"C". The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. +`gpsea` ingests data from these phenopackets and performs analysis of the correlation of specific variants, variant types (e.g., missense vs. premature termination codon), or variant location in protein motifs or other features. The phenotypic abnormalities are represented by `Human Phenotype Ontology (HPO) `_ terms. Statistical analysis is performed using a `Fisher Exact Test `_, @@ -35,11 +35,11 @@ We provide recommended reading for background on the study of genotype-phenotype Feedback -------- -The best place to leave feedback, ask questions, and report bugs is the `genophenocorr Issue Tracker `_. +The best place to leave feedback, ask questions, and report bugs is the `GPSEA Issue Tracker `_. .. toctree:: :caption: Installation & Tutorial - :name: tutorial + :name: index-toc :maxdepth: 1 :hidden: diff --git a/docs/report/tbx5_cohort_info.html b/docs/report/tbx5_cohort_info.html new file mode 100644 index 000000000..23038359d --- /dev/null +++ b/docs/report/tbx5_cohort_info.html @@ -0,0 +1,334 @@ + + + + + Cohort + + + + +

genophenocorr cohort analysis

+

Successfully ingested 156 individuals.

+ +

No errors encountered.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Top 10 HPO Terms

+ A total of 116 HPO terms were used to annotated the cohort. +
HPO TermIDSeen in n individuals
Atrial septal defectHP:000163150
Ventricular septal defectHP:000162941
Hypoplasia of the radiusHP:000298440
Triphalangeal thumbHP:000119936
Absent thumbHP:000977732
Short thumbHP:000977832
Abnormal carpal morphologyHP:000119130
Secundum atrial septal defectHP:000168427
Absent radiusHP:000397415
Cardiac conduction abnormalityHP:003154614
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Top 10 Variants

+ Variants are shown according to NM_181486.4. A total of 156 unique variants were identified in the cohort. +
CountVariant keyVariant NameProtein VariantVariant Class
2212_114385521_114385521_C_Tc.710G>Ap.Arg237GlnMISSENSE_VARIANT
2012_114401830_114401830_C_Tc.238G>Ap.Gly80ArgMISSENSE_VARIANT
812_114385563_114385563_G_Ac.668C>Tp.Thr223MetMISSENSE_VARIANT
612_114398675_114398675_G_Tc.408C>Ap.Tyr136TerSTOP_GAINED
512_114398682_114398682_C_CGc.400dupp.Arg134ProfsTer49FRAMESHIFT_VARIANT
512_114403792_114403792_C_CGc.106_107insCp.Ser36ThrfsTer25FRAMESHIFT_VARIANT
512_114399514_114399514_A_Cc.361T>Gp.Trp121GlyMISSENSE_VARIANT, SPLICE_REGION_VARIANT
412_114385474_114385474_A_Gc.755+2T>CNoneSPLICE_DONOR_VARIANT
412_114398656_114398656_C_CGc.426dupp.Ala143ArgfsTer40FRAMESHIFT_VARIANT
412_114366360_114366360_C_Tc.787G>Ap.Val263MetMISSENSE_VARIANT
+ + + + + + + + + + + + + + + + +
+

Diseases

+
Disease NameDisease IDAnnotation Count
Holt-Oram syndromeOMIM:142900156
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Variant categories for NM_181486.4

+
Variant effectAnnotation Count
FRAMESHIFT_VARIANT38
MISSENSE_VARIANT85
STOP_GAINED19
SPLICE_REGION_VARIANT10
SPLICE_ACCEPTOR_VARIANT2
SPLICE_DONOR_VARIANT7
SPLICE_DONOR_5TH_BASE_VARIANT2
INTRON_VARIANT2
INFRAME_INSERTION2
STOP_RETAINED_VARIANT2
INFRAME_DELETION1
+ + + + + \ No newline at end of file diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv new file mode 100644 index 000000000..db0572a09 --- /dev/null +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -0,0 +1,18 @@ +"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, +,Count,Percent,Count,Percent,p value,Corrected p value +Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,5.6190936213143254e-05,0.0008990549794102921 +Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.00043478260869565214,0.003478260869565217 +Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.0036231884057971015,0.014492753623188406 +Heart block [HP:0012722],0/22,0%,2/2,100%,0.0036231884057971015,0.014492753623188406 +Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.005628510156750059,0.018011232501600187 +Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.01349527665317139,0.03598740440845704 +Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.02560162393963452,0.058517997576307476 +Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.07440639019586388,0.14881278039172777 +Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.1424522583078588,0.2532484592139712 +Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.1687456462342971,0.2699930339748754 +Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.42857142857142855,0.6233766233766234 +Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.5714285714285713,0.7619047619047618 +Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.7735491022101784,0.9520604334894502 +Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 +Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 diff --git a/docs/report/tbx5_mtc_report.html b/docs/report/tbx5_mtc_report.html new file mode 100644 index 000000000..175a02451 --- /dev/null +++ b/docs/report/tbx5_mtc_report.html @@ -0,0 +1,155 @@ + + + + + Cohort + + + + +

Phenotype testing report

+

Phenotype MTC filter: HPO MTC filter

+

Multiple testing correction: fdr_bh

+

Performed statistical tests for 15 out of the total of 259 HPO terms.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
CodeReasonCount
TODOSkipping term because all genotypes have same HPO observed proportions42
TODOSkipping general term41
TODOSkipping term with only 0 observations (not powered for 2x2)41
TODOSkipping term with only 2 observations (not powered for 2x2)25
TODOSkipping term with only 1 observations (not powered for 2x2)24
TODOSkipping term with only 3 observations (not powered for 2x2)22
TODOSkipping term with only 4 observations (not powered for 2x2)14
TODOSkipping term with only 6 observations (not powered for 2x2)13
TODOSkipping term with maximum frequency that was less than threshold 0.210
TODOSkipping term with only 5 observations (not powered for 2x2)4
TODOSkipping term because no genotype has more than one observed HPO count3
TODOSkipping non-target term3
TODOSkipping term because one genotype had zero observations2
+ + \ No newline at end of file diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 83894c00d..a68557df0 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -4,115 +4,263 @@ Tutorial ======== -The tutorial demonstrates how to load an example Phenopacket cohort and perform genotype-phenotype analysis. +Here we demonstrate an end-to-end genotype-phenotype analysis with GPSEA. +The tutorial illustrates just one of the many ways GPSEA can be used to characterize genotype-phenotype correlations. +The :ref:`user-guide` contains details about additional methods and functionalities. -Set up analysis -^^^^^^^^^^^^^^^ -`genophenocorr` needs HPO to do the analysis. Let's load the ontology: +The tutorial presents an analysis of a cohort of individuals with pathogenic variants in *TBX5* leading to +`Holt-Oram syndrome MIM:142900 `_. -.. doctest:: tutorial +Holt-Oram syndrome is an autosomal dominant disorder characterized by +upper limb defects, congenital heart defects, and arrhythmias (`PMID:38336121 `_). +It has been observed in the literature that congenital defects of the ventricular and atrial septum are more +common in the truncating than in the missense variants (`PMID:30552424 `_). +Additionally, upper limb defects are more frequent in patients with protein-truncating variants (`PMID:38336121 `_). - >>> import os - >>> import hpotk - >>> hpo = hpotk.load_minimal_ontology(os.path.join('docs', 'data', 'hp.toy.json')) +To perform the analysis, we curated the literature and created on `GA4GH phenopacket `_ for each +affected individual. The phenopackets are made available in `Phenopacket Store `_. -.. tip:: - Use the latest HPO which you can get at `http://purl.obolibrary.org/obo/hp.json` -TODO - move the code from `workflow` and the notebook here. +The analysis +~~~~~~~~~~~~ -Prepare samples -^^^^^^^^^^^^^^^ +For the analysis, the `MANE `_ transcript +(i.e., the "main" biomedically relevant transcript of a gene) should be chosen unless +there is a specific reason not to (which should occur rarely if at all). -Now we need some samples. To keep things simple in this tutorial, we will use a toy cohort that is shipped -with the package: +In the case of *TBX5* the MANE transcript is `NM_181486.4`. Note that the trascript identifier (`NM_181486`) and the version (`4`) are both required. +A good way to find the MANE transcript is to search on the gene symbol (e.g., *TBX5*) in `ClinVar `_ and to +choose a variant that is specifically located in the gene. The MANE transcript will be displayed here (e.g., `NM_181486.4(TBX5):c.1221C>G (p.Tyr407Ter) +`_). -.. doctest:: tutorial +We additionally need the corresponding protein identifier. +A good way to find this is to search on the transcript id in `NCBI Nucleotide `_. +In our case, search on `NM_181486.4` will bring us to `this page `_. +If we search within this page for `"NP_"`, this will bring us to the +corresponding protein accession `NP_852259.1`. - >>> from genophenocorr.data import get_toy_cohort - >>> cohort = get_toy_cohort() +>>> cohort_name = 'TBX5' +>>> tx_id = 'NM_181486.4' +>>> px_id = 'NP_852259.1' -.. seealso:: - See :ref:`input-data` section to learn about preparing your data for the analysis. +Load HPO +^^^^^^^^ -We can then view the data using the list commands. +GPSEA needs HPO to do the analysis. +We use HPO toolkit to load HPO version `v2023-10-09`: -.. doctest:: tutorial +>>> import hpotk +>>> ontology_store = hpotk.configure_ontology_store() +>>> hpo = ontology_store.load_minimal_hpo(release='v2023-10-09') - >>> sorted(cohort.get_patient_ids()) - ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] - >>> sorted(cohort.list_present_phenotypes()) - [('HP:0001166', 14), ('HP:0001250', 20), ('HP:0001257', 17)] - >>> sorted(cohort.list_all_variants()) - [('1_281_281_A_G', 16), ('1_361_363_TTC_T', 13)] - >>> sorted(cohort.list_all_proteins()) - [('NP_09876.5', 29)] - >>> tx_dict = cohort.variant_effect_count_by_tx(tx_id='NM_1234.5') - >>> sorted(tx_dict['NM_1234.5'].items()) - [('FRAMESHIFT_VARIANT', 1), ('MISSENSE_VARIANT', 1)] +.. tip:: -Using the counts, we can choose and run what analyses we want. + Use the latest HPO release by omitting the `release` option. -For instance, we can partition the patients into two groups based on presence/absence of a *missense* variant: +Prepare cohort +^^^^^^^^^^^^^^ -.. doctest:: tutorial +Now we will load the samples to analyze. We will use the cohort of 156 individuals with mutations in *TBX5* +whose clinical signs and symptoms were encoded into HPO terms +and stored in `Phenopacket Store `_. - >>> from genophenocorr.analysis.predicate.genotype import VariantPredicates - >>> from genophenocorr.model import VariantEffect +>>> from ppktstore.registry import configure_phenopacket_registry +>>> phenopacket_registry = configure_phenopacket_registry() +>>> with phenopacket_registry.open_phenopacket_store('0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets(cohort_name)) +>>> len(phenopackets) +156 - >>> missense_in_tx = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_1234.5') +We loaded 156 phenopackets which need further preprocessing to prepare for the analysis. +We will compute functional annotations for the mutations and then include the individuals into +a :class:`~genophenocorr.model.Cohort`: -We created a predicate `missense_in_tx` that tests -if a variant leads to an aminoacid change in a transcript `NM_1234.5`. -We will use the predicate in the downstream analysis to assign the patients -into a group and test for genotype-phenotype correlation between the groups. +>>> from genophenocorr.preprocessing import configure_caching_cohort_creator, load_phenopackets +>>> cohort_creator = configure_caching_cohort_creator(hpo) +>>> cohort, validation = load_phenopackets( # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +... phenopackets=phenopackets, +... cohort_creator=cohort_creator, +... ) +Patients Created: ... -.. note:: +and we will check that there are no Q/C issues: - There are many other ways to set up a predicate for testing - for genotype-phenotype correlation. - See the :ref:`predicates` section to learn more about building - a predicate for the analysis of interest. +>>> validation.summarize() # doctest: +SKIP +Validated under none policy +No errors or warnings were found -Next, we will prepare and run the genotype-phenotype analysis -to test the correlation between phenotypes and presence of a missense variant. +We loaded the patient data into a `cohort` which is ready for the next steps. -.. doctest:: tutorial +.. seealso:: - >>> from genophenocorr.analysis import configure_cohort_analysis - >>> from genophenocorr.analysis.predicate import PatientCategories # TODO - explain the predicate or update the API - - >>> cohort_analysis = configure_cohort_analysis(cohort, hpo) - >>> missense = cohort_analysis.compare_hpo_vs_genotype(missense_in_tx) - -We prepared the `cohort_analysis` and we ran the analysis to test for correlation -between the clinical signs and symptoms of patients encoded into HPO terms -and presence of a missense variant in the patient. - -We stored the results into `missense` variable. We can summarize the results -by showing a data frame with results for each tested HPO term. -We show the first two terms here: - - >>> import pandas as pd - >>> pd.set_option('expand_frame_repr', False) - >>> summary_df = missense.summarize(hpo, PatientCategories.YES) - >>> summary_df.head(2) # doctest: +NORMALIZE_WHITESPACE - MISSENSE_VARIANT on NM_1234.5 Yes No - Count Percent Count Percent p value Corrected p value - Arachnodactyly [HP:0001166] 13/16 81% 1/10 10% 0.000781 0.020299 - Spasticity [HP:0001257] 11/16 69% 6/10 60% 0.692449 1.000000 + Here we show how to create a :class:`~genophenocorr.model.Cohort` from phenopackets. + See :ref:`input-data` section to learn how to create a cohort from another inputs. -.. - We're showing just 1 row above. This is due to 2-.. rows all having corrected p value of `1.000` resulting - in unstable sort order. We can show more rows with a better cohort, as soon as we have it! +Explore cohort +^^^^^^^^^^^^^^ -.. +We can now explore the cohort to see how many patients are included. - We can show analysis for `VariantEffect.FRAMESHIFT_VARIANT` as well.. +>>> from genophenocorr.view import CohortViewable +>>> viewer = CohortViewable(hpo) +>>> report = viewer.process(cohort=cohort, transcript_id=tx_id) +>>> with open('docs/report/tbx5_cohort_info.html', 'w') as fh: # doctest: +SKIP +... _ = fh.write(report) -The table presents the HPO terms that annotate the cohort members and report their counts and p values -for each genotype group. The rows are sorted by the corrected p value in ascending order. +.. raw:: html + :file: report/tbx5_cohort_info.html + +.. note:: + + The report can also be displayed directly in a Jupyter notebook by running:: + + from IPython.display import HTML, display + display(HTML(report)) + +Now we can show the distribution of variants with respect to the encoded protein. +We first obtain `tx_coordinates` (:class:`~genophenocorr.model.TranscriptCoordinates`) +and `protein_meta` (:class:`~genophenocorr.model.ProteinMetadata`) +with information about the transcript and protein "anatomy": + +>>> from genophenocorr.model.genome import GRCh38 +>>> from genophenocorr.preprocessing import configure_protein_metadata_service, VVMultiCoordinateService +>>> txc_service = VVMultiCoordinateService(genome_build=GRCh38) +>>> pms = configure_protein_metadata_service() +>>> tx_coordinates = txc_service.fetch(tx_id) +>>> protein_meta = pms.annotate(px_id) + +and we follow with plotting the diagram of the mutations on the protein: + +>>> from genophenocorr.view import ProteinVisualizer +>>> import matplotlib.pyplot as plt +>>> fig, ax = plt.subplots(figsize=(15, 8)) +>>> visualizer = ProteinVisualizer() +>>> visualizer.draw_protein_diagram( +... tx_coordinates, +... protein_meta, +... cohort, +... ax=ax, +... ) +>>> fig.tight_layout() +>>> fig.savefig('docs/img/tutorial/tbx5_protein_diagram.png') # doctest: +SKIP + +.. image:: /img/tutorial/tbx5_protein_diagram.png + :alt: TBX5 protein diagram + :align: center + :width: 600px + + +Prepare genotype and phenotype predicates +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We will create a predicate to bin patients into group +depending on presence of a missense and frameshift variant to test +if there is a difference between frameshift and non-frameshift variants +in the individuals of the *TBX5* cohort. + +>>> from genophenocorr.model import VariantEffect +>>> from genophenocorr.analysis.predicate.genotype import VariantPredicates, groups_predicate +>>> gt_predicate = groups_predicate( +... predicates=( +... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) +... ), +... group_names=('Missense', 'Frameshift'), +... ) +>>> gt_predicate.get_question() +'Genotype group: Missense, Frameshift' + +.. note:: + + There are many other ways to set up a predicate for testing + for a GP correlation. + See the :ref:`predicates` section to learn more about building + a predicate of interest. + + +By default, GPSEA will perform one hypothesis test for each HPO term used to annotate more than one individual in the cohort. +This also includes the terms implied by the ontology "true path rule", +which states that presence of a term +(e.g., `Ventricular septal defect `_) +implies presence of all its ancestor terms +(e.g., `Abnormal ventricular septum morphology `_, +`Abnormal cardiac septum morphology `_, +`Abnormal cardiac ventricle morphology `_, ...). +However, testing multiple hypothesis increases the chance of receiving false positive result, +and multiple testing correction must be applied. +See :ref:`mtc` for information about how to perform multiple testing correction with GPSEA. + +For general use, we recommend using a combination +of a *Phenotype MTC filter* (:class:`~genophenocorr.analysis.PhenotypeMtcFilter`) with a *multiple testing correction*. +Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which +reduce the multiple testing burden and focus the analysis +on the most interesting terms (see :ref:`HPO MTC filter ` for more info). +Then the multiple testing correction, such as Bonferroni or Benjamini-Hochberg, +is used to control the family-wise error rate or the false discovery rate. + +Here we use HPO MTC filter (:meth:`~genophenocorr.analysis.CohortAnalysisConfiguration.hpo_mtc_strategy`) +along with Benjamini-Hochberg procedure (:meth:`~genophenocorr.analysis.CohortAnalysisConfiguration.pval_correction`): + +>>> from genophenocorr.analysis import configure_cohort_analysis, CohortAnalysisConfiguration +>>> config = CohortAnalysisConfiguration() +>>> config.hpo_mtc_strategy() +>>> config.pval_correction = 'fdr_bh' +>>> analysis = configure_cohort_analysis( +... cohort=cohort, +... hpo=hpo, +... config=config, +... ) + +Now we can perform the analysis and investigate the results. + +>>> result = analysis.compare_genotype_vs_cohort_phenotypes(gt_predicate) +>>> result.total_tests +16 + +We only tested 16 HPO terms. This is despite the individuals being collectively annotated with +259 direct and indirect HPO terms + +>>> from genophenocorr.analysis import prepare_hpo_terms_of_interest +>>> terms = prepare_hpo_terms_of_interest(hpo, cohort.all_patients, min_n_of_patients_with_term=2) +>>> len(terms) +259 + +We can show the reasoning behind *not* testing 243 (`259 - 16`) HPO terms +by exploring the phenotype MTC filtering report. + +>>> from genophenocorr.view import MtcStatsViewer +>>> mtc_viewer = MtcStatsViewer() +>>> mtc_report = mtc_viewer.process(result.mtc_filter_report) +>>> with open('docs/report/tbx5_mtc_report.html', 'w') as fh: # doctest: +SKIP +... _ = fh.write(mtc_report) + +.. raw:: html + :file: report/tbx5_mtc_report.html + +.. + + TODO: + Show how to write out the tested HPO terms. + +and these are the HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: + +>>> from genophenocorr.analysis.predicate import PatientCategories +>>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP + +.. csv-table:: *TBX5* frameshift vs missense + :file: report/tbx5_frameshift_vs_missense.csv + :header-rows: 2 + +We see that several HPO terms are significantly associated +with presence of a frameshift variant in *TBX5*. +For example, `Ventricular septal defect `_ +was observed in 31/60 (52%) patients with a missense variant +but it was observed in 19/19 (100%) patients with a frameshift variant. +Fisher exact test computed a p value of `~0.0000562` +and the p value corrected by Benjamini-Hochberg procedure +is `~0.00112`. diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index a73e8e079..b56d45719 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -158,7 +158,7 @@ The genotype predicate will bin the patient into two groups: a point mutation gr ... group_names=('Point', 'LoF'), ... ) >>> gt_predicate.get_question() -'What group does the patient belong to: Point, LoF' +'Genotype group: Point, LoF' Now phenotype predicate. The authors divide the patients into groups according to the count of structural defects in these groups: diff --git a/src/genophenocorr/analysis/__init__.py b/src/genophenocorr/analysis/__init__.py index 75d6d6080..0b4f157ba 100644 --- a/src/genophenocorr/analysis/__init__.py +++ b/src/genophenocorr/analysis/__init__.py @@ -4,6 +4,7 @@ from ._config import CohortAnalysisConfiguration, configure_cohort_analysis, configure_default_protein_metadata_service, MtcStrategy from ._gp_analysis import apply_predicates_on_patients from ._mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter +from ._util import prepare_hpo_terms_of_interest __all__ = [ 'configure_cohort_analysis', @@ -13,4 +14,5 @@ 'PhenotypeMtcFilter', 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter', 'apply_predicates_on_patients', 'configure_default_protein_metadata_service', + 'prepare_hpo_terms_of_interest', ] diff --git a/src/genophenocorr/analysis/_api.py b/src/genophenocorr/analysis/_api.py index ad0cb1ea8..dc0d13ad0 100644 --- a/src/genophenocorr/analysis/_api.py +++ b/src/genophenocorr/analysis/_api.py @@ -9,7 +9,7 @@ from genophenocorr.preprocessing import ProteinMetadataService from .predicate import PolyPredicate, GenotypePolyPredicate, PatientCategory from .predicate.genotype import VariantPredicate, ProteinPredicates -from .predicate.phenotype import P, CountingPhenotypeScorer +from .predicate.phenotype import P, PhenotypePolyPredicate, CountingPhenotypeScorer PatientsByHPO = namedtuple('PatientsByHPO', field_names=['all_with_hpo', 'all_without_hpo']) @@ -24,19 +24,19 @@ def __init__( filter_name: str, mtc_name: str, filter_results_map: typing.Mapping[str, int], - term_count: int, + n_terms_before_filtering: int, ): """ Args: filter_name: name of the MTC filter strategy (e.g. `heuristic sampler`) mtc_name: name of the MTC function (e.g. `bonferroni`) filter_results_map: mapping with reasons for filtering out a term as keys, and counts of filtered terms as values - term_count: the number of HPO terms before filtering + n_terms_before_filtering: the number of HPO terms before filtering """ self._filter_name = filter_name self._mtc_name = mtc_name self._results_map = filter_results_map - self._term_count = term_count + self._n_terms_before_filtering = n_terms_before_filtering @property def filter_method(self) -> str: @@ -63,12 +63,11 @@ def mtc_method(self) -> str: return self._mtc_name @property - def n_terms_tested(self) -> int: + def n_terms_before_filtering(self) -> int: """ - Returns: - the number of HPO terms before filtering. + Get the number of terms before filtering. """ - return self._term_count + return self._n_terms_before_filtering class GenotypePhenotypeAnalysisResult: @@ -149,7 +148,7 @@ def total_tests(self) -> int: """ Get total count of tests that were run for this analysis. """ - return len(self.all_counts) + return len(self._all_counts) @property def mtc_filter_report(self) -> typing.Optional[HpoMtcReport]: @@ -327,14 +326,15 @@ def compare_hpo_vs_recessive_genotype( @abc.abstractmethod def compare_hpo_vs_genotype_groups( self, - first: VariantPredicate, - second: VariantPredicate, + predicates: typing.Iterable[VariantPredicate], + group_names: typing.Iterable[str], ) -> GenotypePhenotypeAnalysisResult: """ - Bin patients according to a presence of at least one allele that matches `first` or `second` predicate - and test for genotype-phenotype correlations between the groups. + Bin patients according to a presence of at least one allele that matches + any of the provided `predicates` and test for genotype-phenotype correlations + between the groups. - Note, the patients that pass testing by both predicates are *OMITTED* from the analysis! + Note, the patients that pass testing by >1 genotype predicate are *OMITTED* from the analysis! """ pass @@ -380,6 +380,35 @@ def compare_genotype_vs_phenotype_score( """ pass + @abc.abstractmethod + def compare_genotype_vs_cohort_phenotypes( + self, + gt_predicate: GenotypePolyPredicate, + ) -> GenotypePhenotypeAnalysisResult: + # TODO: if we had access to the cohort, it would be trivial to prepare + # all phenotype predicates and to call `self.compare_genotype_vs_phenotypes`. + pass + + @abc.abstractmethod + def compare_genotype_vs_phenotypes( + self, + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + ): + """ + All analysis functions go through this function. + + The genotype predicate will partition the individuals into non-overlapping groups + along the genotype axis. + The phenotype predicates represent the phenotypes we want to test. + Less phenotypes may actually be tested due to :class:`genophenocorr.analysis.PhenotypeMtcFilter`. + + Args: + gt_predicate: a predicate for binning the individuals along the genotype axis + pheno_predicates: phenotype predicates for test the individuals along the phenotype axis + """ + pass + @staticmethod def _check_min_perc_patients_w_hpo(min_perc_patients_w_hpo: typing.Union[int, float], cohort_size: int) -> float: diff --git a/src/genophenocorr/analysis/_config.py b/src/genophenocorr/analysis/_config.py index 9c2ac43ee..20b4fb1f6 100644 --- a/src/genophenocorr/analysis/_config.py +++ b/src/genophenocorr/analysis/_config.py @@ -63,7 +63,6 @@ class CohortAnalysisConfiguration: ============================== ======================= ========================================= Option Type Default value ============================== ======================= ========================================= - ``missing_implies_excluded`` `bool` `False` ``pval_correction`` `str` `bonferroni` ``min_n_patients_with_term`` `int` `2` ``mtc_alpha`` `float` `0.05` @@ -75,7 +74,6 @@ class CohortAnalysisConfiguration: def __init__(self): self._logger = logging.getLogger(__name__) - self._missing_implies_excluded = False self._pval_correction = 'bonferroni' self._mtc_alpha = .05 self._min_n_patients_with_term = 2 @@ -84,26 +82,6 @@ def __init__(self): self._terms_to_test = None # # only relevant for SPECIFIED_TERMS strategy self._min_patients_w_hpo = None # # only relevant for HPO_MTC strategy - @property - def missing_implies_excluded(self) -> bool: - """ - `True` if we assume that a patient without a specific phenotype listed *does not* have the phenotype. - Otherwise, the only excluded phenotypes are those that are excluded explicitly. - """ - return self._missing_implies_excluded - - @missing_implies_excluded.setter - def missing_implies_excluded(self, missing_implies_excluded: bool): - """ - :param missing_implies_excluded: a `bool` to dictate is the missing phenotypic features should be considered - as excluded. - """ - if isinstance(missing_implies_excluded, bool): - self._missing_implies_excluded = missing_implies_excluded - else: - self._logger.warning('Ignoring invalid `missing_implies_excluded` value %s. Using %s.', - missing_implies_excluded, self._missing_implies_excluded) - @property def pval_correction(self) -> typing.Optional[str]: """ @@ -114,8 +92,10 @@ def pval_correction(self) -> typing.Optional[str]: @pval_correction.setter def pval_correction(self, pval_correction: typing.Optional[str]): """ - Set the method for p value correction. - See Statsmodels' `documentation `_ + Set the method for p value correction. + + See Statsmodels' + `documentation `_ for the acceptable values. :param pval_correction: a `str` with the name of the desired multiple testing correction method or `None` @@ -141,7 +121,7 @@ def min_n_patients_with_term(self, value: int): self._min_n_patients_with_term = value else: self._logger.warning( - 'Ignoring invalid `min_n_patients_with_term` value %s. Using %s', + 'Ignoring invalid `min_n_patients_with_term` value %s. Using %s', value, self._min_n_patients_with_term, ) @@ -162,8 +142,10 @@ def mtc_alpha(self, mtc_alpha: float): if isinstance(mtc_alpha, float) and 0. < mtc_alpha <= 1.: self._mtc_alpha = mtc_alpha else: - self._logger.warning('`mtc_alpha` should be a `float` in range `(0, 1]` but was %s. Keeping the previous value %s', - mtc_alpha, self._mtc_alpha) + self._logger.warning( + '`mtc_alpha` should be a `float` in range `(0, 1]` but was %s. Keeping the previous value %s', + mtc_alpha, self._mtc_alpha, + ) @property def include_sv(self) -> bool: @@ -312,7 +294,6 @@ def configure_cohort_analysis( hpo=hpo, protein_service=protein_metadata_service, gp_analyzer=gp_analyzer, - missing_implies_excluded=config.missing_implies_excluded, min_n_of_patients_with_term=config.min_n_patients_with_term, include_sv=config.include_sv, ) @@ -352,8 +333,8 @@ def configure_default_protein_metadata_service( cache_dir: typing.Optional[str] = None, ) -> ProteinMetadataService: """ - Create default protein metadata service that will cache the protein metadata - in current working directory under `.genophenocorr_cache/protein_cache` + Create default protein metadata service that will cache the protein metadata + in current working directory under `.genophenocorr_cache/protein_cache` and reach out to UNIPROT REST API if a cache entry is missing. """ cache_dir = _configure_cache_dir(cache_dir) diff --git a/src/genophenocorr/analysis/_gp_analysis.py b/src/genophenocorr/analysis/_gp_analysis.py index 55f4fac9f..e85471285 100644 --- a/src/genophenocorr/analysis/_gp_analysis.py +++ b/src/genophenocorr/analysis/_gp_analysis.py @@ -19,7 +19,7 @@ def apply_predicates_on_patients( gt_predicate: GenotypePolyPredicate, ) -> typing.Tuple[ typing.Collection[PatientCategory], - pd.Series, + typing.Mapping[P, int], typing.Mapping[P, pd.DataFrame], ]: """ @@ -39,7 +39,7 @@ def apply_predicates_on_patients( Returns: a tuple with 3 items: - a collection of unique :class:`PatientCategory` items that the patients were binned into - - a :class:`pd.Series` with mapping from a phenotype :class:`P` (e.g. an HPO term or a disease) + - a mapping from a phenotype :class:`P` (e.g. an HPO term or a disease) to an `int` with count of patients that could be binned according to the phenotype `P` - a mapping from phenotype :class:`P` to a data frame with counts of patients in i-th phenotype category and j-th genotype category where i and j are rows and columns of the data frame @@ -140,15 +140,21 @@ def analyze( categories, n_usable, all_counts = apply_predicates_on_patients( patients, pheno_predicates, gt_predicate, ) - original_phenotype_count = len(n_usable) + n_terms_before_filtering = len(n_usable) # 1.5) Filter terms for MTC - n_usable_filtered, all_counts_filtered, reason2count = self._mtc_filter.filter_terms_to_test(n_usable, all_counts) + n_usable_filtered, all_counts_filtered, reason2count = self._mtc_filter.filter_terms_to_test( + gt_predicate=gt_predicate, + n_usable=n_usable, + all_counts=all_counts, + ) if len(n_usable_filtered) == 0: raise ValueError("No phenotypes are left for the analysis after MTC filtering step") + assert len(n_usable_filtered) == len(all_counts_filtered) + # 2) Statistical tests - pheno_idx = pd.Index(n_usable_filtered.keys(), name='p_val') + pheno_idx = pd.Index(all_counts_filtered.keys(), name='p_val') pvals = pd.Series(float('nan'), index=pheno_idx, name='p value') for phenotype in pheno_idx: counts = all_counts_filtered[phenotype] @@ -166,7 +172,7 @@ def analyze( # 3) Multiple correction if self._correction is not None: _, pvals_corrected, _, _ = multitest.multipletests(pvals, alpha=self._mtc_alpha, method=self._correction) - corrected_idx = pd.Index(n_usable_filtered.keys(), name='p_val_corrected') + corrected_idx = pd.Index(all_counts_filtered.keys(), name='p_val_corrected') corrected_pvals_series = pd.Series( data=pvals_corrected, index=corrected_idx, name='Corrected p value', ) @@ -180,7 +186,7 @@ def analyze( filter_name=self._mtc_filter.filter_method_name(), mtc_name=mtc_method, filter_results_map=reason2count, - term_count=original_phenotype_count, + n_terms_before_filtering=n_terms_before_filtering, ) # 4) Wrap up diff --git a/src/genophenocorr/analysis/_gp_impl.py b/src/genophenocorr/analysis/_gp_impl.py index aebfd8643..178efcbe0 100644 --- a/src/genophenocorr/analysis/_gp_impl.py +++ b/src/genophenocorr/analysis/_gp_impl.py @@ -1,8 +1,6 @@ import logging import typing -from collections import Counter - import hpotk import pandas as pd @@ -19,6 +17,7 @@ from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult, PhenotypeScoreAnalysisResult from ._gp_analysis import GPAnalyzer +from ._util import prepare_hpo_terms_of_interest class GpCohortAnalysis(CohortAnalysis): @@ -28,7 +27,6 @@ def __init__( hpo: hpotk.MinimalOntology, protein_service: ProteinMetadataService, gp_analyzer: GPAnalyzer, - missing_implies_excluded: bool, min_n_of_patients_with_term: int, include_sv: bool = False, ): @@ -53,10 +51,8 @@ def __init__( self._hpo_terms_of_interest = prepare_hpo_terms_of_interest( hpo=self._hpo, patients=self._patient_list, - missing_implies_excluded=missing_implies_excluded, min_n_of_patients_with_term=min_n_of_patients_with_term, ) - self._missing_implies_excluded = missing_implies_excluded def compare_hpo_vs_genotype( self, @@ -69,7 +65,7 @@ def compare_hpo_vs_genotype( assert isinstance(predicate, VariantPredicate), \ f'{type(predicate)} is not an instance of `VariantPredicate`' bool_predicate = wrap_as_boolean_predicate(predicate) - return self._apply_poly_predicate_on_hpo_terms(bool_predicate) + return self.compare_genotype_vs_cohort_phenotypes(bool_predicate) def compare_hpo_vs_recessive_genotype( self, @@ -80,12 +76,12 @@ def compare_hpo_vs_recessive_genotype( and test for genotype-phenotype correlations. """ rec_predicate = wrap_as_recessive_predicate(predicate) - return self._apply_poly_predicate_on_hpo_terms(rec_predicate) + return self.compare_genotype_vs_cohort_phenotypes(rec_predicate) def compare_hpo_vs_genotype_groups( - self, - predicates: typing.Iterable[VariantPredicate], - group_names: typing.Iterable[str], + self, + predicates: typing.Iterable[VariantPredicate], + group_names: typing.Iterable[str], ) -> GenotypePhenotypeAnalysisResult: """ Bin patients according to a presence of at least one allele that matches @@ -98,7 +94,7 @@ def compare_hpo_vs_genotype_groups( predicates=predicates, group_names=group_names, ) - return self._apply_poly_predicate_on_hpo_terms(predicate) + return self.compare_genotype_vs_cohort_phenotypes(predicate) def compare_disease_vs_genotype( self, @@ -109,7 +105,7 @@ def compare_disease_vs_genotype( # This can be updated to any genotype poly predicate in future, if necessary. genotype_predicate = wrap_as_boolean_predicate(predicate) - return self._apply_poly_predicate(pheno_predicates, genotype_predicate) + return self.compare_genotype_vs_phenotypes(pheno_predicates, genotype_predicate) def _prepare_disease_predicates( self, @@ -169,27 +165,15 @@ def compare_genotype_vs_phenotype_score( p_value=float(result.pvalue), ) - def _apply_poly_predicate_on_hpo_terms( - self, - gt_predicate: GenotypePolyPredicate, + def compare_genotype_vs_cohort_phenotypes( + self, + gt_predicate: GenotypePolyPredicate, ) -> GenotypePhenotypeAnalysisResult: assert isinstance(gt_predicate, GenotypePolyPredicate), \ f'{type(gt_predicate)} is not an instance of `GenotypePolyPredicate`' pheno_predicates = self._prepare_phenotype_predicates() - return self._apply_poly_predicate(pheno_predicates, gt_predicate) - - # Make public, eventually convenience functions written - def _apply_poly_predicate( - self, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], - gt_predicate: GenotypePolyPredicate, - ): - return self._gp_analyzer.analyze( - patients=self._patient_list, - pheno_predicates=pheno_predicates, - gt_predicate=gt_predicate, - ) + return self.compare_genotype_vs_phenotypes(gt_predicate, pheno_predicates) def _prepare_phenotype_predicates(self) -> typing.Sequence[PhenotypePolyPredicate[P]]: return tuple( @@ -199,74 +183,13 @@ def _prepare_phenotype_predicates(self) -> typing.Sequence[PhenotypePolyPredicat ) for query in self._hpo_terms_of_interest) - -def prepare_hpo_terms_of_interest( - hpo: hpotk.MinimalOntology, - patients: typing.Iterable[Patient], - missing_implies_excluded: bool, - min_n_of_patients_with_term: int, -) -> typing.Iterable[hpotk.TermId]: - present_count = Counter() - excluded_count = Counter() - - for patient in patients: - for pf in patient.phenotypes: - if pf.is_present: - # A present phenotypic feature must be counted in. - present_count[pf.identifier] += 1 - # and it also implies presence of its ancestors. - for anc in hpo.graph.get_ancestors(pf): - present_count[anc] += 1 - else: - # An excluded phenotypic feature - excluded_count[pf.identifier] += 1 - for desc in hpo.graph.get_descendants(pf): - # implies exclusion of its descendants. - excluded_count[desc] += 1 - - if missing_implies_excluded: - # We must do another pass to ensure that we account the missing features as excluded. - # `present_count` has all phenotypic features that were observed as present among the cohort members. - for pf in present_count: - for patient in patients: - pf_can_be_inferred_as_excluded_for_patient = False - for phenotype in patient.phenotypes: - if pf == phenotype.identifier: - # Patient is explicitly annotated with `pf`. No inference for this patient! - pf_can_be_inferred_as_excluded_for_patient = False - break - - if phenotype.is_present and hpo.graph.is_ancestor_of(pf, phenotype): - # The `pf` is implicitly observed in the patient. No inference for this patient! - pf_can_be_inferred_as_excluded_for_patient = False - break - elif phenotype.is_excluded and hpo.graph.is_descendant_of(pf, phenotype): - # The `pf` is implicitly excluded in the patient. No inference for this patient! - pf_can_be_inferred_as_excluded_for_patient = False - break - else: - # There is no implicit or explicit observation of `pf` for this patient. - # Therefore, we infer that it is excluded! - pf_can_be_inferred_as_excluded_for_patient = True - - if pf_can_be_inferred_as_excluded_for_patient: - excluded_count[pf] += 1 - for desc in hpo.graph.get_descendants(pf): - excluded_count[desc] += 1 - - total_count = Counter() - for term_id, count in present_count.items(): - total_count[term_id] += count - - for term_id, count in excluded_count.items(): - total_count[term_id] += count - - final_hpo = [] - for term_id in present_count: - # Keep the term if it is mentioned at least *n* times (incl. being excluded) - # in the cohort - n_all = total_count[term_id] - if n_all >= min_n_of_patients_with_term: - final_hpo.append(term_id) - - return tuple(final_hpo) + def compare_genotype_vs_phenotypes( + self, + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + ) -> GenotypePhenotypeAnalysisResult: + return self._gp_analyzer.analyze( + patients=self._patient_list, + pheno_predicates=pheno_predicates, + gt_predicate=gt_predicate, + ) diff --git a/src/genophenocorr/analysis/_mtc_filter.py b/src/genophenocorr/analysis/_mtc_filter.py index 89ed01ef4..2025c0ed3 100644 --- a/src/genophenocorr/analysis/_mtc_filter.py +++ b/src/genophenocorr/analysis/_mtc_filter.py @@ -6,7 +6,7 @@ import hpotk import pandas as pd -from .predicate import PatientCategories +from .predicate import PatientCategories, PatientCategory, GenotypePolyPredicate class PhenotypeMtcFilter(metaclass=abc.ABCMeta): @@ -23,6 +23,7 @@ class PhenotypeMtcFilter(metaclass=abc.ABCMeta): @abc.abstractmethod def filter_terms_to_test( self, + gt_predicate: GenotypePolyPredicate, n_usable: typing.Mapping[hpotk.TermId, int], all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], ) -> typing.Tuple[ @@ -36,9 +37,11 @@ def filter_terms_to_test( to lead to interesting statistical/analytical results. Args: + gt_predicate: the predicate used to bin patients into groups along the genotype axis n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients that could be binned according to the used genotype/phenotype predicate. - all_counts: a mapping from the :class:`hpotk.TermId` to + all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients + in the i-th phenotype (rows) and j-th genotype (column) group Returns: a tuple with three items: - a mapping from :class:`hpotk.TermId` -> @@ -49,7 +52,9 @@ def filter_terms_to_test( @abc.abstractmethod def filter_method_name(self) -> str: - """returns the name of the heuristic used to limit multiple testing""" + """ + Get a `str` with the MTC filter name to display for humans. + """ pass @@ -62,6 +67,7 @@ class UseAllTermsMtcFilter(PhenotypeMtcFilter): def filter_terms_to_test( self, + gt_predicate: GenotypePolyPredicate, n_usable: typing.Mapping[hpotk.TermId, int], all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], ) -> typing.Tuple[ @@ -76,7 +82,7 @@ def filter_terms_to_test( return n_usable, all_counts, {} def filter_method_name(self) -> str: - return "test all terms" + return "All HPO terms" class SpecifiedTermsMtcFilter(PhenotypeMtcFilter): @@ -107,6 +113,7 @@ def __init__( def filter_terms_to_test( self, + gt_predicate: GenotypePolyPredicate, n_usable: typing.Mapping[hpotk.TermId, int], all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], ) -> typing.Tuple[ @@ -116,21 +123,26 @@ def filter_terms_to_test( ]: """ Remove terms that are not members of the specific set of HPO terms to be tested. + Args: - n_usable: dictionary with HPO term ids seen in our cohort and their counts - all_counts: Dictionary with HPO term ids from cohort and DataFrames with detailed GPC counts + gt_predicate: the predicate used to bin patients into groups along the genotype axis + n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients + that could be binned according to the used genotype/phenotype predicate. + all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients + in the i-th phenotype (rows) and j-th genotype (column) group Returns: filtered versions of the two dictionaries above and dataframe with reasons for skipping """ filtered_n_usable = {} - filtered_all_counts = pd.Series() + filtered_all_counts = {} reason_for_filtering_out: typing.DefaultDict[str, int] = defaultdict(int) - for term_id in n_usable.keys(): + for term_id in all_counts.keys(): if term_id not in self._terms_to_test_set: reason_for_filtering_out["Skipping non-specified term"] += 1 continue + # if we get here, then the term is a member of our list of terms to be tested. filtered_n_usable[term_id] = n_usable[term_id] filtered_all_counts[term_id] = all_counts[term_id] @@ -138,7 +150,7 @@ def filter_terms_to_test( return filtered_n_usable, filtered_all_counts, reason_for_filtering_out def filter_method_name(self) -> str: - return "specify terms" + return "Specified terms MTC filter" class HpoMtcFilter(PhenotypeMtcFilter): @@ -239,6 +251,7 @@ def __init__( def filter_terms_to_test( self, + gt_predicate: GenotypePolyPredicate, n_usable: typing.Mapping[hpotk.TermId, int], all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame], ) -> typing.Tuple[ @@ -246,23 +259,33 @@ def filter_terms_to_test( typing.Mapping[hpotk.TermId, pd.DataFrame], typing.Mapping[str, int], ]: + """ + Args: + gt_predicate: the predicate used to bin patients into groups along the genotype axis + n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients + that could be binned according to the used genotype/phenotype predicate. + all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients + in the i-th phenotype (rows) and j-th genotype (column) group + """ filtered_n_usable = {} - filtered_all_counts = pd.Series() + filtered_all_counts = {} reason_for_filtering_out = defaultdict(int) tested_counts_pf = defaultdict( pd.DataFrame ) # key is an HP id, value is a tuple with counts, i.e., # iterate through the terms in the sorted order, starting from the leaves of the induced graph. - for term_id in self._get_ordered_terms(n_usable.keys()): + for term_id in self._get_ordered_terms(all_counts.keys()): if term_id in self._general_hpo_terms: reason_for_filtering_out["Skipping general term"] += 1 continue + if not self._hpo.graph.is_ancestor_of( hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id ): reason_for_filtering_out["Skipping non phenotype term"] += 1 continue + # get total number of observations if term_id not in all_counts: reason_for_filtering_out["Skipping non-target term"] += 1 @@ -276,24 +299,34 @@ def filter_terms_to_test( if max_freq < self._hpo_term_frequency_filter: reason = ( "Skipping term with maximum frequency " - + "that was less than threshold {self._hpo_term_frequency_filter}" + f"that was less than threshold {self._hpo_term_frequency_filter}" ) reason_for_filtering_out[reason] += 1 + continue if counts_frame.shape == (2, 2) and total < 7: reason = f"Skipping term with only {total} observations (not powered for 2x2)" reason_for_filtering_out[reason] += 1 continue + # todo -- similar for (3,2) if not HpoMtcFilter.some_cell_has_greater_than_one_count(counts_frame): reason = "Skipping term because no genotype has more than one observed HPO count" reason_for_filtering_out[reason] += 1 continue - elif HpoMtcFilter.genotypes_have_same_hpo_proportions(counts_frame): + + elif HpoMtcFilter.genotypes_have_same_hpo_proportions( + counts_frame, + gt_predicate.get_categories(), + ): reason = "Skipping term because all genotypes have same HPO observed proportions" reason_for_filtering_out[reason] += 1 continue - elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(counts_frame): + + elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( + counts_frame, + gt_predicate.get_categories(), + ): reason = "Skipping term because one genotype had zero observations" reason_for_filtering_out[reason] += 1 continue @@ -305,6 +338,7 @@ def filter_terms_to_test( reason = "Child term with same counts previously tested" reason_for_filtering_out[reason] += 1 continue + # if we get here, then we include the test for `term_id` filtered_n_usable[term_id] = n_usable[term_id] filtered_all_counts[term_id] = all_counts[term_id] @@ -312,7 +346,7 @@ def filter_terms_to_test( return filtered_n_usable, filtered_all_counts, reason_for_filtering_out def filter_method_name(self) -> str: - return "hpo mtc" + return "HPO MTC filter" @staticmethod def get_number_of_observed_hpo_observations(counts_frame: pd.DataFrame) -> int: @@ -331,16 +365,21 @@ def get_maximum_group_observed_HPO_frequency(counts_frame: pd.DataFrame) -> floa return df.max() @staticmethod - def one_genotype_has_zero_hpo_observations(counts: pd.DataFrame): + def one_genotype_has_zero_hpo_observations( + counts: pd.DataFrame, + gt_categories: typing.Sequence[PatientCategory], + ): if not isinstance(counts, pd.DataFrame): raise ValueError( f"argument 'counts' must be pandas DataFrame but was {type(counts)}" ) if counts.shape == (2, 2): + assert len(gt_categories) == 2, \ + f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!" + a, b = gt_categories return ( - counts.loc[:, PatientCategories.YES].sum() == 0 - or counts.loc[:, PatientCategories.NO].sum() == 0 + counts.loc[:, a].sum() == 0 or counts.loc[:, b].sum() == 0 ) elif counts.shape == (2, 3): raise ValueError("(2,3) not yet implemented") @@ -370,6 +409,7 @@ def some_cell_has_greater_than_one_count(counts: pd.DataFrame) -> bool: @staticmethod def genotypes_have_same_hpo_proportions( counts: pd.DataFrame, + gt_categories: typing.Sequence[PatientCategory], delta: float = 0.01, ) -> bool: """ @@ -388,10 +428,13 @@ def genotypes_have_same_hpo_proportions( ) if counts.shape == (2, 2): - num1 = counts.loc[PatientCategories.YES, PatientCategories.NO] - denom1 = counts.loc[:, PatientCategories.NO].sum() - num2 = counts.loc[PatientCategories.YES, PatientCategories.YES] - denom2 = counts.loc[:, PatientCategories.YES].sum() + assert len(gt_categories) == 2, \ + f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!" + a, b = gt_categories + num1 = counts.loc[PatientCategories.YES, a] + denom1 = counts.loc[:, a].sum() + num2 = counts.loc[PatientCategories.YES, b] + denom2 = counts.loc[:, b].sum() if denom1 == 0 or denom2 == 0: return False return abs(num1 / denom1 - num2 / denom2) < delta diff --git a/src/genophenocorr/analysis/_util.py b/src/genophenocorr/analysis/_util.py new file mode 100644 index 000000000..fecae9155 --- /dev/null +++ b/src/genophenocorr/analysis/_util.py @@ -0,0 +1,60 @@ +import typing + +from collections import Counter + +import hpotk + + +from genophenocorr.model import Patient + + +def prepare_hpo_terms_of_interest( + hpo: hpotk.graph.GraphAware, + patients: typing.Iterable[Patient], + min_n_of_patients_with_term: int, +) -> typing.Collection[hpotk.TermId]: + """ + Prepare a collection of HPO terms to test. + + This includes the direct HPO patient annotations + as well as the ancestors of the present terms and the descendants of the excluded terms. + + :param hpo: an entity with an HPO graph (e.g. :class:`hpotk.MinimalOntology`). + :param patients: an iterable with patients. + :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term + (either directly or indirectly) for the term to be included in the analysis. + """ + present_count = Counter() + excluded_count = Counter() + + for patient in patients: + for pf in patient.phenotypes: + if pf.is_present: + # A present phenotypic feature must be counted in. + present_count[pf.identifier] += 1 + # and it also implies presence of its ancestors. + for anc in hpo.graph.get_ancestors(pf): + present_count[anc] += 1 + else: + # An excluded phenotypic feature + excluded_count[pf.identifier] += 1 + for desc in hpo.graph.get_descendants(pf): + # implies exclusion of its descendants. + excluded_count[desc] += 1 + + total_count = Counter() + for term_id, count in present_count.items(): + total_count[term_id] += count + + for term_id, count in excluded_count.items(): + total_count[term_id] += count + + final_hpo = [] + for term_id in present_count: + # Keep the term if it is mentioned at least *n* times (incl. being excluded) + # in the cohort + n_all = total_count[term_id] + if n_all >= min_n_of_patients_with_term: + final_hpo.append(term_id) + + return tuple(final_hpo) diff --git a/src/genophenocorr/analysis/predicate/genotype/_gt_predicates.py b/src/genophenocorr/analysis/predicate/genotype/_gt_predicates.py index 5ce81f8e3..79129b422 100644 --- a/src/genophenocorr/analysis/predicate/genotype/_gt_predicates.py +++ b/src/genophenocorr/analysis/predicate/genotype/_gt_predicates.py @@ -73,7 +73,7 @@ def __init__( self._counters = tuple(counters) self._categorizations = tuple(categorizations) group_names = ', '.join(c.category.name for c in self._categorizations) - self._question = f'What group does the patient belong to: {group_names}' + self._question = f'Genotype group: {group_names}' def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations diff --git a/src/genophenocorr/model/_cohort.py b/src/genophenocorr/model/_cohort.py index d05ffc48b..270ebc034 100644 --- a/src/genophenocorr/model/_cohort.py +++ b/src/genophenocorr/model/_cohort.py @@ -11,29 +11,17 @@ class Patient: - """A class that represents an individual patient - - Attributes: - labels (SampleLabels): The patient identifiers - phenotypes (Sequence[Phenotype]): A list of Phenotype objects - diseases (Sequence[Disease]): A list of Disease objects - variants (Sequence[Variant]): A list of Variant objects + """ + `Patient` represents a single investigated individual. """ def __init__( - self, - labels: SampleLabels, - phenotypes: typing.Iterable[Phenotype], - diseases: typing.Iterable[Disease], - variants: typing.Iterable[Variant] - ): - """Constructs all necessary attributes for a Patient object - - Args: - labels (string): A string unique to this Patient object - phenotypes (Iterable[Phenotype]): A list of Phenotype objects - variants (Iterable[Variant]): A list of Variant objects - """ + self, + labels: SampleLabels, + phenotypes: typing.Iterable[Phenotype], + diseases: typing.Iterable[Disease], + variants: typing.Iterable[Variant] + ): self._labels = labels self._phenotypes = tuple(phenotypes) self._diseases = tuple(diseases) @@ -42,8 +30,7 @@ def __init__( @property def patient_id(self) -> str: """ - Returns: - string: Patient ID unique to this Patient object + Get a unique patient ID. """ return self._labels.label_summary() @@ -57,39 +44,46 @@ def labels(self) -> SampleLabels: @property def phenotypes(self) -> typing.Sequence[Phenotype]: """ - Returns: - Sequence[Phenotype]: A list of Phenotype objects associated with this Patient object + Get the phenotypes observed and excluded in the patient. """ return self._phenotypes @property def diseases(self) -> typing.Sequence[Disease]: + """ + Get the diseases the patient has (not) been diagnosed with. + """ return self._diseases @property def variants(self) -> typing.Sequence[Variant]: """ - Returns: - Sequence[Variant]: A list of Variant objects associated with this Patient object + Get a list of variants observed in the patient. """ return self._variants def present_phenotypes(self) -> typing.Iterator[Phenotype]: """ - Get an iterator over *present* phenotypes of the patient. + Get an iterator over the *present* phenotypes of the patient. """ return filter(lambda p: p.is_present, self._phenotypes) def excluded_phenotypes(self) -> typing.Iterator[Phenotype]: """ - Get an iterator over *excluded* phenotypes of the patient. + Get an iterator over the *excluded* phenotypes of the patient. """ return filter(lambda p: p.is_excluded, self._phenotypes) def present_diseases(self) -> typing.Iterator[Disease]: + """ + Get an iterator with diseases the patient was diagnosed with. + """ return filter(lambda d: d.is_present, self._diseases) def excluded_diseases(self) -> typing.Iterator[Disease]: + """ + Get an iterator with diseases whose presence was excluded in the patient. + """ return filter(lambda d: not d.is_present, self._diseases) def __str__(self) -> str: @@ -123,11 +117,6 @@ def from_patients( ): """ Create a cohort from a sequence of patients. - - Args: - members: an iterable with patients - include_patients_with_no_variants: - include_patients_with_no_HPO: """ # TODO: move this logic into `CohortCreator` and remove `excluded_member_count` from `Cohort`. filtered = set() @@ -148,9 +137,9 @@ def from_patients( ) def __init__( - self, - members: typing.Iterable[Patient], - excluded_member_count: int, + self, + members: typing.Iterable[Patient], + excluded_member_count: int, ): self._patient_set = frozenset(members) self._excluded_count = excluded_member_count @@ -158,62 +147,62 @@ def __init__( @property def all_patients(self) -> typing.Collection[Patient]: """ - Returns: - set: A collection of all the Patient objects in the Cohort + Get a collection of all patients in the cohort. """ return self._patient_set def all_phenotypes(self) -> typing.Set[Phenotype]: """ - Returns: - set: A set of all the Phenotype objects in the Cohort + Get a set of all phenotypes (observed or excluded) in the cohort members. """ return set(itertools.chain(phenotype for patient in self._patient_set for phenotype in patient.phenotypes)) def all_diseases(self) -> typing.Set[Disease]: + """ + Get a set of all diseases (observed or excluded) in the cohort members. + """ return set(itertools.chain(disease for patient in self._patient_set for disease in patient.diseases)) def all_variants(self) -> typing.Set[Variant]: """ - Returns: - set: A set of all the Variant objects in the Cohort + Get a set of all variants observed in the cohort members. """ return set(itertools.chain(variant for patient in self._patient_set for variant in patient.variants)) @property def all_transcript_ids(self) -> typing.Set[str]: """ - Returns: - set: A set of all the transcript IDs in the Cohort + Get a set of all transcript IDs affected by the cohort variants. """ return set(tx.transcript_id for v in self.all_variants() for tx in v.tx_annotations) @property def total_patient_count(self): """ - Returns: - integer: The count of all the Patient objects + Get the total number of cohort members. """ return len(self._patient_set) def get_patient_ids(self) -> typing.Set[str]: """ - Returns: - list: A list of all the patient IDs in the Cohort + Get a set of the patient IDs. """ return set(pat.patient_id for pat in self._patient_set) def list_present_phenotypes( - self, - top=None, + self, + top: typing.Optional[int] = None, ) -> typing.Sequence[typing.Tuple[str, int]]: """ Get a sequence with counts of HPO terms used as direct annotations of the cohort members. + Args: - top (integer, Optional): If not given, lists all present phenotypes. - Otherwise, lists only the `top` highest counts + top typing.Optional[int]: If not given, lists all present phenotypes. + Otherwise, lists only the `top` highest counts + Returns: - list: A list of tuples, formatted (phenotype CURIE, number of patients with that phenotype) + typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE, + number of patients with that phenotype) """ counter = Counter() for patient in self._patient_set: @@ -235,7 +224,8 @@ def list_all_variants( ) -> typing.Sequence[typing.Tuple[str, int]]: """ Args: - top (integer, Optional): If not given, lists all variants. Otherwise, lists only the `top` highest counts + top typing.Optional[int]: If not given, lists all variants. Otherwise, lists only the `top` highest counts + Returns: list: A sequence of tuples, formatted (variant key, number of patients with that variant) """ @@ -250,7 +240,8 @@ def list_all_proteins( ) -> typing.Sequence[typing.Tuple[str, int]]: """ Args: - top (integer, Optional): If not given, lists all proteins. Otherwise, lists only the `top` highest counts + top typing.Optional[int]: If not given, lists all proteins. Otherwise, lists only the `top` highest counts. + Returns: list: A list of tuples, formatted (protein ID string, the count of variants that affect the protein) """ @@ -260,15 +251,16 @@ def list_all_proteins( return counter.most_common(top) def variant_effect_count_by_tx( - self, - tx_id: typing.Optional[str] = None, + self, + tx_id: typing.Optional[str] = None, ) -> typing.Mapping[str, typing.Mapping[str, int]]: """ Count variant effects for all transcripts or for a transcript `tx_id` of choice. Args: - tx_id (string, Optional): a `str` with transcript accession (e.g. `NM_123456.5`) + tx_id: a `str` with transcript accession (e.g. `NM_123456.5`) or `None` if all transcripts should be listed. + Returns: mapping: Each transcript ID references a Counter(), with the variant effect as the key and the count of variants with that effect on the transcript id. diff --git a/src/genophenocorr/model/_phenotype.py b/src/genophenocorr/model/_phenotype.py index 17ee7938a..def0ec9c4 100644 --- a/src/genophenocorr/model/_phenotype.py +++ b/src/genophenocorr/model/_phenotype.py @@ -9,10 +9,6 @@ class Phenotype(hpotk.model.Identified, hpotk.model.ObservableFeature): `Phenotype` represents a clinical sign or symptom represented as an HPO term. The phenotype can be either present in the patient or excluded. - - Attributes: - term_id (hpotk.TermId): The HPO ID associated with this phenotype - is_observed (bool): Is True if this phenotype was observed in the respective patient """ @staticmethod @@ -29,20 +25,15 @@ def __init__( @property def identifier(self) -> hpotk.TermId: - """Returns an HPO ID unique to this Phenotype object. - You can find more information about the phenotype by searching this ID at https://hpo.jax.org/app/ - - Returns: - hpotk.model.Named: HPO ID + """ + Get the phenotype ID; an HPO term ID most of the time. """ return self._term_id @property def is_present(self) -> bool: """ - Returns: - boolean: `True` if the phenotype feature was observed in the subject or `False` if the feature's presence - was explicitly excluded. + Return `True` if the phenotype feature was observed in the subject or `False` if the feature's presence was explicitly excluded. """ return self._observed @@ -84,45 +75,38 @@ def __repr__(self): class Disease(hpotk.model.Identified, hpotk.model.ObservableFeature, hpotk.model.Named): - """A class that represents a disease - - Attributes: - term_id (hpotk.model.Named): The ID given by the user to reference this disease - name (str): The name given by the user for this disease - is_observed (bool): Is True if this disease was observed in the respective patient + """ + Representation of a disease diagnosed (or excluded) in an investigated individual. """ - def __init__(self, term_id: hpotk.TermId, - name: str, - is_observed: bool) -> None: + def __init__( + self, + term_id: hpotk.TermId, + name: str, + is_observed: bool, + ): self._term_id = hpotk.util.validate_instance(term_id, hpotk.TermId, 'term_id') self._name = hpotk.util.validate_instance(name, str, 'name') self._observed = hpotk.util.validate_instance(is_observed, bool, 'is_observed') @property def identifier(self) -> hpotk.TermId: - """Returns an ID unique to this Disease object. - - Returns: - hpotk.model.Named: Disease ID + """ + Get the disease ID. """ return self._term_id @property def name(self): - """Returns a string that describes this Disease object. - - Returns: - string: disease name + """ + Get the disease label (e.g. `LEIGH SYNDROME, NUCLEAR; NULS`). """ return self._name @property def is_present(self) -> bool: """ - Returns: - boolean: `True` if the disease was observed in the subject or `False` if the disease's presence - was explicitly excluded. + Return `True` if the disease was diagnosed in the individual or `False` if it was excluded. """ return self._observed @@ -137,8 +121,8 @@ def __hash__(self): def __str__(self): return f"Disease(" \ - f"identifier={self.identifier}, " \ - f"name={self.name}, " \ + f"identifier={self._term_id}, " \ + f"name={self._name}, " \ f"is_present={self._observed})" def __repr__(self): diff --git a/src/genophenocorr/model/_protein.py b/src/genophenocorr/model/_protein.py index e3689f032..c4a17baea 100644 --- a/src/genophenocorr/model/_protein.py +++ b/src/genophenocorr/model/_protein.py @@ -7,15 +7,11 @@ from .genome import Region - class FeatureInfo: - """A class that represents a protein feature - (e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11") - - Attributes: - name (string): The given name or description of the protein feature - region (Region): The protein feature region coordinates """ + `FeatureInfo` represents a protein feature + (e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11") +""" def __init__(self, name: str, region: Region): self._name = hpotk.util.validate_instance(name, str, 'name') @@ -72,18 +68,29 @@ def __repr__(self) -> str: class FeatureType(enum.Enum): - """An enum class defining available feature types that can be found in the protein sequence. - - Attributes: - REPEAT: a repeated sequence motif or repeated domain within the protein - MOTIF: a short (usually not more than 20 amino acids) conserved sequence motif of biological significance - DOMAIN: a specific combination of secondary structures organized into a characteristic three-dimensional structure or fold - REGION: a region of interest that cannot be described in other subsections """ + An enum representing the protein feature types supported in GPSEA. + """ + REPEAT = enum.auto() + """ + A repeated sequence motif or repeated domain within the protein. + """ + MOTIF = enum.auto() + """ + A short (usually not more than 20 amino acids) conserved sequence motif of biological significance. + """ + DOMAIN = enum.auto() + """ + A specific combination of secondary structures organized into a characteristic three-dimensional structure or fold. + """ + REGION = enum.auto() + """ + A region of interest that cannot be described in other subsections. + """ class ProteinFeature(metaclass=abc.ABCMeta): @@ -107,11 +114,8 @@ def to_string(self) -> str: class SimpleProteinFeature(ProteinFeature): - """A class that represents a protein feature - - Attributes: - info (FeatureInfo): A FeatureInfo object, describing name and location of the feature - feature_type (FeatureType): A FeatureType object, limited to specific feature types + """ + An implementation of a `ProteinFeature`. """ # Not part of the public API. @@ -146,46 +150,34 @@ def feature_type(self) -> FeatureType: return self._type def __str__(self) -> str: - return f"SimpleProteinFeature(type={self.feature_type}, " \ - f"info={str(self.info)})" + return f"SimpleProteinFeature(type={self._type}, " \ + f"info={self._info})" def __repr__(self) -> str: return str(self) def __eq__(self, other) -> bool: - return isinstance(other, ProteinFeature) \ - and self.feature_type == other.feature_type \ - and self.info == other.info + return isinstance(other, SimpleProteinFeature) \ + and self._type == other._type \ + and self._info == other._info def __hash__(self) -> int: - return hash((self.feature_type, self.info)) + return hash((self._type, self._info)) class ProteinMetadata: - """A class that represents a specific protein - - Attributes: - protein_id (string): A string unique to this protein - label (string): Full name of the protein - protein_features (Sequence[ProteinFeature]): A sequence of ProteinFeature objects - length(int): The length of the Protein (chain) in amino-acid residues - # TODO -- provided with default argument of zero to not break tests. Refactor tests. - Methods: - domains (Iterable[ProteinFeature]): A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "DOMAIN" - repeats (Iterable[ProteinFeature]): A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "REPEAT" - regions (Iterable[ProteinFeature]): A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "REGION" - motifs (Iterable[ProteinFeature]): A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "MOTIF" + """ + An info regarding a protein sequence, including an ID, a label, + and location of protein features, such as motifs, domains, or other regions. """ - def __init__(self, protein_id: str, label: str, protein_features: typing.Sequence[ProteinFeature], protein_length:int=0): - """Constructs all necessary attributes for a ProteinMetadata object - - Args: - protein_id (string): A string unique to this protein - label (string): Full name of the protein - protein_features (Sequence[ProteinFeature]): A sequence of ProteinFeature objects - protein_length (int): The length of the Protein (chain) in amino-acid residues - """ + def __init__( + self, + protein_id: str, + label: str, + protein_features: typing.Sequence[ProteinFeature], + protein_length: int = 0, + ): if not isinstance(protein_id, str): raise ValueError(f"Protein ID must be type string but is type {type(protein_id)}") self._id = protein_id diff --git a/src/genophenocorr/model/_variant.py b/src/genophenocorr/model/_variant.py index c33e1ba13..c5b13333a 100644 --- a/src/genophenocorr/model/_variant.py +++ b/src/genophenocorr/model/_variant.py @@ -14,7 +14,6 @@ class TranscriptInfoAware(metaclass=abc.ABCMeta): The implementors know about basic gene/transcript identifiers. """ - @property @abc.abstractmethod def gene_id(self) -> str: @@ -35,19 +34,9 @@ def transcript_id(self) -> str: class TranscriptAnnotation(TranscriptInfoAware): - """Class that represents results of the functional annotation of a variant with respect to single transcript of a gene. - - Args: - gene_id (string): The gene symbol associated with the transcript - tx_id (string): The transcript ID - hgvsc (string): The HGVS "coding-DNA" ID if available, else None - is_preferred (bool): The transcript is a MANE transcript, canonical Ensembl transcript, etc. - variant_effects (Iterable[string]): An iterable of predicted effects given by VEP - affected_exons (Iterable[integer]): An iterable of exons affected by the variant. Returns None if none are affected. - protein_id (string): The protein ID for the protein encoded by the transcript. - hgvsp (Optional[str]): Predicted effect on the encoded protein or `None` if not available`. - protein_effect_coordinates (Region, optional): An optional :class:`Region` with start and end coordinates - of the effect on the protein sequence. + """ + `TranscriptAnnotation` represent a result of the functional annotation of a variant + with respect to single transcript of a gene. """ def __init__( @@ -79,22 +68,21 @@ def __init__( self._hgvsp = hpotk.util.validate_instance(hgvsp, str, 'hgvsp') else: self._hgvsp = None - self._protein_effect_location = hpotk.util.validate_optional_instance(protein_effect_coordinates, Region, - 'protein_effect_coordinates') + self._protein_effect_location = hpotk.util.validate_optional_instance( + protein_effect_coordinates, Region, 'protein_effect_coordinates', + ) @property def gene_id(self) -> str: """ - Returns: - string: The gene symbol (e.g. SURF1) + Get the HGVS symbol of the affected gene (e.g. *SURF1*). """ return self._gene_id @property def transcript_id(self) -> str: """ - Returns: - string: The transcript RefSeq identifier (e.g. NM_123456.7) + Get the RefSeq identifier of the transcript (e.g. `NM_123456.7`). """ return self._tx_id @@ -109,52 +97,47 @@ def is_preferred(self) -> bool: @property def hgvs_cdna(self) -> typing.Optional[str]: """ - Returns: - string: The HGVS "coding-DNA" representation of the variant (e.g. NM_123456.7:c.9876G>T) - or `None` if not available. + Get the HGVS description of the sequence variant (e.g. ``NM_123456.7:c.9876G>T``) + or `None` if not available. """ return self._hgvs_cdna @property def variant_effects(self) -> typing.Sequence[VariantEffect]: """ - Returns: - Sequence[string]: A sequence of variant effects. - Definitions of these can be found at: http://www.sequenceontology.org/ + Get a sequence of the predicted functional variant effects. """ return self._variant_effects @property def overlapping_exons(self) -> typing.Optional[typing.Sequence[int]]: """ - Returns: - Sequence[integer]: A sequence of 1-based exon indices (the index of the 1st exon is `1`) - that overlap with the variant. + Get a sequence of 1-based exon indices (the index of the 1st exon is `1`) + that overlap with the variant. """ return self._affected_exons @property def protein_id(self) -> typing.Optional[str]: """ - Returns: - Optional[str]: The protein accession ID for the protein relevant to the variant + Get the ID of the protein encoded by the :attr:`transcript_id` + or `None` if the transcript is not protein-coding. """ return self._protein_id @property def hgvsp(self) -> typing.Optional[str]: """ - Returns: - Optional[str]: The HGVS protein sequence name + Get the HGVS description of the protein sequence variant (e.g. ``NP_001027559.1:p.Gln421Ter``) + or `None` if not available. """ return self._hgvsp @property def protein_effect_location(self) -> typing.Optional[Region]: """ - Returns: - Region: a :class:`genophenocorr.model.genome.Region` with start and end position on the protein sequence - that the variant affects. + Get the :class:`~genophenocorr.model.genome.Region` with start and end amino-acid coordinates + affected by the variant. """ return self._protein_effect_location @@ -202,9 +185,9 @@ def __hash__(self) -> int: class VariantClass(enum.Enum): """ - `VariantClass` represents a high-level variant category + `VariantClass` represents a high-level variant category which mostly corresponds to the structural variant categories - of the Variant Call Format specification, + of the Variant Call Format specification, but includes type for single nucleotide variants (SNV) and multi-nucleotide variant (MNV). """ @@ -249,14 +232,10 @@ class VariantClass(enum.Enum): class VariantCoordinates: - """A representation of coordinates of sequence and symbolic variants. - The breakend variants are not supported. - - Attributes: - region (GenomicRegion): The region spanned by the variant reference allele - ref (string): The reference allele - alt (string): The alternate allele - change_length (integer): The change between the ref and alt alleles due to the variant presence + """ + A representation of coordinates of sequence and symbolic variants. + + Note, the breakend variants are not currently supported. """ @staticmethod @@ -398,50 +377,46 @@ def __init__(self, region: GenomicRegion, ref: str, alt: str, change_length: int @property def chrom(self) -> str: """ - Returns: - string: The label of the chromosome/contig where the variant is located. + Get the label of the chromosome/contig where the variant is located. """ return self._region.contig.name @property def start(self) -> int: """ - Returns: - integer: The 0-based start coordinate (excluded) of the ref allele. + Get the 0-based start coordinate (excluded) of the first base of the :attr:`ref` allele. """ return self._region.start @property def end(self) -> int: """ - Returns: - integer: The 0-based end coordinate (included) of the ref allele. + Get the 0-based end coordinate (included) of the last base of the :attr:`ref` allele. """ return self._region.end @property def region(self) -> GenomicRegion: """ - Returns: - GenomicRegion: The genomic region spanned by the ref allele. + Get the genomic region spanned by the :attr:`ref` allele. """ return self._region @property def ref(self) -> str: """ - Returns: - string: The reference allele (e.g. "A", "N"). The allele may be an empty string. + Get the reference allele (e.g. "A", "CCT", "N"). The allele may be an empty string. """ return self._ref @property def alt(self) -> str: """ - Returns: - string: The alternate allele (e.g. "A", "GG", ""). The allele may be an empty string for sequence variants. - The symbolic alternate allele follow the VCF notation and use the `<` and `>` characters - (e.g. "", ""). + Get the alternate allele (e.g. "A", "GG", ""). + + The allele may be an empty string for sequence variants. + The symbolic alternate allele follow the VCF notation and use the `<` and `>` characters + (e.g. "", ""). """ return self._alt @@ -568,7 +543,7 @@ def __init__( @property def structural_type(self) -> hpotk.TermId: """ - Get term ID of the structural type (e.g. `SO:1000029` for chromosomal deletion). + Get term ID of the structural type (e.g. ``SO:1000029`` for chromosomal deletion). """ return self._structural_type @@ -582,7 +557,7 @@ def variant_class(self) -> VariantClass: @property def gene_id(self) -> str: """ - Get a `str` with gene identifier CURIE (e.g. `HGNC:3603`) or `None` if the identifier is not available. + Get a `str` with gene identifier CURIE (e.g. ``HGNC:3603``) or `None` if the identifier is not available. """ return self._gene_id diff --git a/src/genophenocorr/preprocessing/_api.py b/src/genophenocorr/preprocessing/_api.py index 29b9b635a..8a1a1cfb9 100644 --- a/src/genophenocorr/preprocessing/_api.py +++ b/src/genophenocorr/preprocessing/_api.py @@ -74,7 +74,8 @@ class TranscriptCoordinateService(metaclass=abc.ABCMeta): @abc.abstractmethod def fetch( - self, tx: typing.Union[str, TranscriptInfoAware] + self, + tx: typing.Union[str, TranscriptInfoAware], ) -> TranscriptCoordinates: """ Get tx coordinates for a tx ID or an entity that knows about the tx ID. @@ -83,7 +84,7 @@ def fetch( Args: tx: a `str` with tx ID (e.g. `NM_002834.5`) or an entity that knows about the transcript ID - (e.g. :class:`genophenocorr.model.TranscriptAnnotation`). + (e.g. :class:`genophenocorr.model.TranscriptAnnotation`). Returns: the transcript coordinates. """ @@ -105,7 +106,8 @@ def fetch_for_gene(self, gene: str) -> typing.Sequence[TranscriptCoordinates]: Args: gene: a `str` with tx ID (e.g. `HGNC:3603`) - Returns: a sequence of transcript coordinates for the gene. + Returns: + typing.Sequence[TranscriptCoordinates]: a sequence of transcript coordinates for the gene. """ pass @@ -223,5 +225,3 @@ def summarize( else: file.write("No errors or warnings were found") file.write(os.linesep) - l_pad = " " * (self._notepad.level * indent) - file.write(os.linesep) diff --git a/src/genophenocorr/preprocessing/_config.py b/src/genophenocorr/preprocessing/_config.py index bd4069f6a..12eef36cf 100644 --- a/src/genophenocorr/preprocessing/_config.py +++ b/src/genophenocorr/preprocessing/_config.py @@ -154,10 +154,6 @@ def configure_patient_creator( timeout: float = 30., ) -> PhenopacketPatientCreator: # Rename to something more understandable by user """ - ^^^ none, lenient, strict - - none = run unless unrunnable - lenient = fix what we can, abort unfixable - strict = abort at any issue A convenience function for configuring a non-caching :class:`genophenocorr.preprocessing.PhenopacketPatientCreator`. To create the patient creator, we need hpo-toolkit's representation of HPO. Other options are optional @@ -246,10 +242,10 @@ def _setup_phenotype_creator(hpo: hpotk.MinimalOntology, def _configure_functional_annotator( - cache_dir: str, - variant_fallback: str, - timeout: float, - ) -> FunctionalAnnotator: + cache_dir: str, + variant_fallback: str, + timeout: float, +) -> FunctionalAnnotator: # (2) FunctionalAnnotator # Setup fallback @@ -265,9 +261,9 @@ def _configure_functional_annotator( def _configure_fallback_functional( - variant_fallback: str, - timeout: float, - ) -> FunctionalAnnotator: + variant_fallback: str, + timeout: float, +) -> FunctionalAnnotator: if variant_fallback == 'VEP': fallback = VepFunctionalAnnotator(timeout=timeout) else: diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py index 12bba8457..be24228d4 100644 --- a/src/genophenocorr/preprocessing/_phenopacket.py +++ b/src/genophenocorr/preprocessing/_phenopacket.py @@ -56,10 +56,10 @@ class PhenopacketVariantCoordinateFinder( VariantCoordinateFinder[GenomicInterpretation] ): """ - `PhenopacketVariantCoordinateFinder` figures out :class:`genophenocorr.model.VariantCoordinates` - and :class:`genophenocorr.model.Genotype` from `GenomicInterpretation` element of Phenopacket Schema. + `PhenopacketVariantCoordinateFinder` figures out :class:`~genophenocorr.model.VariantCoordinates` + and :class:`~genophenocorr.model.Genotype` from `GenomicInterpretation` element of Phenopacket Schema. - :param build: genome build to use in `VariantCoordinates + :param build: genome build to use in `VariantCoordinates` :param hgvs_coordinate_finder: the coordinate finder to use for parsing HGVS expressions """ @@ -85,8 +85,9 @@ def find_coordinates( Args: item (GenomicInterpretation): a genomic interpretation element from Phenopacket Schema + Returns: - VariantCoordinates: variant coordinates + typing.Optional[VariantCoordinates]: variant coordinates """ if not isinstance(item, GenomicInterpretation): raise ValueError( @@ -233,12 +234,12 @@ def __init__( # Set of sequence ontology IDs that we will treat as a deletion (`DEL`) # for the purpose of assigning imprecise SV info with a variant class. self._so_deletions = { - '1000029', # chromosomal deletion: An incomplete chromosome. - '0001893', # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature. - '0001879', # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence, where the deletion, is greater than the extent of the underlying genomic features. + '1000029', # chromosomal deletion: An incomplete chromosome. + '0001893', # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature. + '0001879', # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence, where the deletion, is greater than the extent of the underlying genomic features. } self._so_duplications = { - '1000037', # chromosomal_duplication + '1000037', # chromosomal_duplication } def process(self, inputs: Phenopacket, notepad: Notepad) -> Patient: diff --git a/src/genophenocorr/preprocessing/_vv.py b/src/genophenocorr/preprocessing/_vv.py index 777fb1e38..6242ea754 100644 --- a/src/genophenocorr/preprocessing/_vv.py +++ b/src/genophenocorr/preprocessing/_vv.py @@ -136,7 +136,7 @@ class VariantValidatorDecodeException(BaseException): class VVMultiCoordinateService(TranscriptCoordinateService, GeneCoordinateService): """ - `VVMultiCoordinateService` uses the Variant Validator REST API to fetch transcript coordinates for + `VVMultiCoordinateService` uses the Variant Validator REST API to fetch transcript coordinates for both a *gene* ID and a specific *transcript* ID. :param genome_build: the genome build for constructing the transcript coordinates. @@ -370,7 +370,7 @@ def _parse_cds(coding_start: int, coding_end: int, exons: typing.Iterable[Genomi processed += exon_len - raise ValueError(f'Could not parse CDS start and end from given coordinates') + raise ValueError('Could not parse CDS start and end from given coordinates') @staticmethod def _parse_is_preferred( @@ -379,7 +379,7 @@ def _parse_is_preferred( if 'annotations' in tx_data: annotations = tx_data['annotations'] if 'mane_select' in annotations: - assert type(annotations['mane_select']) == bool, '\'mane_select\' field must be `bool`' + assert isinstance(annotations['mane_select'], bool), '\'mane_select\' field must be `bool`' return annotations['mane_select'] return None diff --git a/src/genophenocorr/view/__init__.py b/src/genophenocorr/view/__init__.py index c29b2c564..6eb13dd61 100644 --- a/src/genophenocorr/view/__init__.py +++ b/src/genophenocorr/view/__init__.py @@ -2,7 +2,7 @@ from ._disease import DiseaseViewable from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable -from ._stats import StatsViewer +from ._stats import MtcStatsViewer from ._txp import VariantTranscriptVisualizer from ._protein_visualizer import ProteinVisualizer from ._formatter import VariantFormatter @@ -11,7 +11,7 @@ 'CohortViewable', 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', - 'StatsViewer', + 'MtcStatsViewer', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/genophenocorr/view/_protein_visualizer.py b/src/genophenocorr/view/_protein_visualizer.py index e805221d9..7dfa1c452 100644 --- a/src/genophenocorr/view/_protein_visualizer.py +++ b/src/genophenocorr/view/_protein_visualizer.py @@ -9,7 +9,7 @@ import matplotlib.colors as mcolors import numpy as np -from genophenocorr.model import VariantEffect +from genophenocorr.model import Cohort, ProteinMetadata, TranscriptCoordinates, VariantEffect from ._protein_visualizable import ProteinVisualizable @@ -53,11 +53,22 @@ def __init__(self, random_seed: int = 42) -> None: self.legend2_max_x = 0.3 self.legend2_max_y = 0.75 + def draw_protein_diagram( + self, + tx_coordinates: TranscriptCoordinates, + protein_metadata: ProteinMetadata, + cohort: Cohort, + ax: typing.Optional[plt.Axes] = None, + labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' + ) -> typing.Optional[plt.Axes]: + pvis = ProteinVisualizable(tx_coordinates, protein_metadata, cohort) + return self.draw_fig(pvis, ax, labeling_method) + def draw_fig( - self, - pvis: ProteinVisualizable, - ax: typing.Optional[plt.Axes] = None, - labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' + self, + pvis: ProteinVisualizable, + ax: typing.Optional[plt.Axes] = None, + labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' ) -> typing.Optional[plt.Axes]: """ Visualize the cohort variants on a protein diagram. diff --git a/src/genophenocorr/view/_stats.py b/src/genophenocorr/view/_stats.py index 22fd75d95..dd17c6fd3 100644 --- a/src/genophenocorr/view/_stats.py +++ b/src/genophenocorr/view/_stats.py @@ -1,15 +1,12 @@ import typing -from hpotk import MinimalOntology from jinja2 import Environment, PackageLoader from genophenocorr.analysis import HpoMtcReport -from genophenocorr.model import Cohort - -class StatsViewer: +class MtcStatsViewer: """ - `StatsViewer` uses a Jinja2 template to create an HTML element for showing in the Jupyter notebook + `MtcStatsViewer` uses a Jinja2 template to create an HTML element for showing in the Jupyter notebook or for writing into a standalone HTML file. """ @@ -18,14 +15,14 @@ def __init__(self): self._cohort_template = environment.get_template("stats.html") def process( - self, - hpo_mtc_report: HpoMtcReport, + self, + hpo_mtc_report: HpoMtcReport, ) -> str: """ - Create an HTML that should be shown with `display(HTML(..))` of the ipython package. + Create an HTML that should be shown with `display(HTML(..))` of the IPython package. Args: - hpo_mtc_report(HpoMtcReport): summary of heuristic term filtering procedure + hpo_mtc_report (HpoMtcReport): summary of heuristic term filtering procedure Returns: str: an HTML string with parameterized template for rendering or writing into a standalone HTML file. @@ -35,7 +32,7 @@ def process( @staticmethod def _prepare_context( - hpo_mtc_report: HpoMtcReport, + hpo_mtc_report: HpoMtcReport, ) -> typing.Mapping[str, typing.Any]: results_map = hpo_mtc_report.skipped_terms_dict if not isinstance(results_map, dict): @@ -50,7 +47,7 @@ def _prepare_context( reason_to_count.append({"reason": reason, "count": count}) n_skipped += count - n_tested = hpo_mtc_report.n_terms_tested - n_skipped + n_tested = hpo_mtc_report.n_terms_before_filtering - n_skipped # The following dictionary is used by the Jinja2 HTML template return { @@ -58,6 +55,6 @@ def _prepare_context( "hpo_mtc_filter_name": hpo_mtc_report.filter_method, "skipped_hpo_count": n_skipped, "tested_hpo_count": n_tested, - "total_hpo_count": hpo_mtc_report.n_terms_tested, + "total_hpo_count": hpo_mtc_report.n_terms_before_filtering, "reason_to_count": reason_to_count, } diff --git a/src/genophenocorr/view/templates/cohort.html b/src/genophenocorr/view/templates/cohort.html index f8ef9379f..d6a3dc866 100644 --- a/src/genophenocorr/view/templates/cohort.html +++ b/src/genophenocorr/view/templates/cohort.html @@ -64,7 +64,7 @@

genophenocorr cohort analysis

-

Successfully ingested {{ n_individuals }} phenopackets.

+

Successfully ingested {{ n_individuals }} individuals.

{% if n_excluded > 0 %}

Not able to ingest {{ n_excluded }} individuals.

{% else %} @@ -80,7 +80,7 @@

Top {{top_hpo_count}} HPO Terms

HPO Term ID - Annotation Count + Seen in n individuals {% for hpo_count in hpo_counts %} @@ -99,20 +99,20 @@

Top {{top_var_count}} Variants

- Variant + Count + Variant key Variant Name Protein Variant Variant Class - Variant Count {% for var_count in var_counts %} + {{ var_count.Count }} {{ var_count.variant }} {{ var_count.variant_name }} {{ var_count.protein_name }} {{ var_count.variant_effects }} - {{ var_count.Count }} - + {% endfor %} diff --git a/src/genophenocorr/view/templates/stats.html b/src/genophenocorr/view/templates/stats.html index c95fe7a39..2a75c055a 100644 --- a/src/genophenocorr/view/templates/stats.html +++ b/src/genophenocorr/view/templates/stats.html @@ -45,17 +45,22 @@ -

Statistical analysis: {{mtc_name}} ({{ hpo_mtc_filter_name }})

+

Phenotype testing report

+

Phenotype MTC filter: {{ hpo_mtc_filter_name }}

+

Multiple testing correction: {{ mtc_name }}

Performed statistical tests for {{ tested_hpo_count }} out of the total of {{ total_hpo_count }} HPO terms.

- + + {% for skipped in reason_to_count %} + + diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index e626ae0e9..defa998ab 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -34,7 +34,7 @@ def test_get_question( predicate: GenotypePolyPredicate, ): question = predicate.get_question() - assert question == "What group does the patient belong to: Point, LoF" + assert question == "Genotype group: Point, LoF" def test_get_categorizations( self, diff --git a/tests/analysis/test_config.py b/tests/analysis/test_config.py index b66224175..d730da892 100644 --- a/tests/analysis/test_config.py +++ b/tests/analysis/test_config.py @@ -8,7 +8,6 @@ class TestCohortAnalysisConfiguration: def test_default_values(self): config = CohortAnalysisConfiguration() - assert config.missing_implies_excluded is False assert config.pval_correction == 'bonferroni' assert config.min_patients_w_hpo is None assert config.include_sv is False diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index e1899fc6c..c95ebec48 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -40,6 +40,23 @@ def patient_counts( ) return n_usable, all_counts + @pytest.fixture(scope='class') + def gt_categories(self) -> pd.Index: + return pd.Index([PatientCategories.YES, PatientCategories.NO]) + + @pytest.fixture(scope='class') + def pheno_categories(self) -> pd.Index: + return pd.Index([PatientCategories.YES, PatientCategories.NO]) + + @staticmethod + def prepare_counts_df( + counts, + index: pd.Index, + columns: pd.Index, + ): + values = np.array(counts).reshape((2, 2)) + return pd.DataFrame(data=values, index=index, columns=columns) + @pytest.mark.parametrize( "counts, expected", [ @@ -53,10 +70,15 @@ def test_one_genotype_has_zero_hpo_observations( self, counts: typing.Tuple[int], expected: bool, + gt_categories: pd.Index, + pheno_categories: pd.Index, ): - counts_df = self.prepare_counts_df(counts) + counts_df = TestHeuristicSamplerMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories) - actual = HpoMtcFilter.one_genotype_has_zero_hpo_observations(counts=counts_df) + actual = HpoMtcFilter.one_genotype_has_zero_hpo_observations( + counts=counts_df, + gt_categories=gt_categories, + ) assert actual == expected @@ -77,8 +99,10 @@ def test_some_cell_has_greater_than_one_count( self, counts: typing.Tuple[int], expected: bool, + gt_categories: pd.Index, + pheno_categories: pd.Index, ): - counts_df = self.prepare_counts_df(counts) + counts_df = TestHeuristicSamplerMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories) actual = HpoMtcFilter.some_cell_has_greater_than_one_count(counts=counts_df) @@ -96,23 +120,33 @@ def test_genotypes_have_same_hpo_proportions( self, counts: typing.Tuple[int], expected: bool, + gt_categories: pd.Index, + pheno_categories: pd.Index, ): - counts_df = self.prepare_counts_df(counts) + counts_df = TestHeuristicSamplerMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories) - actual = HpoMtcFilter.genotypes_have_same_hpo_proportions(counts=counts_df) + actual = HpoMtcFilter.genotypes_have_same_hpo_proportions( + counts=counts_df, + gt_categories=gt_categories, + ) assert actual == expected def test_filter_terms_to_test( self, mtc_filter: HpoMtcFilter, + suox_gt_predicate: GenotypePolyPredicate, patient_counts: typing.Tuple[ typing.Mapping[hpotk.TermId, int], typing.Mapping[hpotk.TermId, pd.DataFrame], ], ): n_usable, all_counts = patient_counts - mtc_report = mtc_filter.filter_terms_to_test(n_usable, all_counts) + mtc_report = mtc_filter.filter_terms_to_test( + suox_gt_predicate, + n_usable, + all_counts, + ) assert isinstance(mtc_report, tuple) assert len(mtc_report) == 3 @@ -130,6 +164,7 @@ def test_filter_terms_to_test( def test_specified_term_mtc_filter( self, hpo: hpotk.MinimalOntology, + suox_gt_predicate: GenotypePolyPredicate, patient_counts: typing.Tuple[ typing.Mapping[hpotk.TermId, int], typing.Mapping[hpotk.TermId, pd.DataFrame], @@ -143,7 +178,11 @@ def test_specified_term_mtc_filter( """ specified_filter = SpecifiedTermsMtcFilter(hpo=hpo, terms_to_test={hpotk.TermId.from_curie("HP:0032350")}) n_usable, all_counts = patient_counts - mtc_report = specified_filter.filter_terms_to_test(n_usable, all_counts) + mtc_report = specified_filter.filter_terms_to_test( + suox_gt_predicate, + n_usable, + all_counts, + ) assert isinstance(mtc_report, tuple) assert len(mtc_report) == 3 # # Skipping non-specified term (n=5) @@ -152,14 +191,6 @@ def test_specified_term_mtc_filter( assert len(filtered_n_usable) == 1 assert reason_for_filtering_out['Skipping non-specified term'] == 4 - @staticmethod - def prepare_counts_df(counts): - index = pd.Index([PatientCategories.YES, PatientCategories.NO]) - columns = pd.Index([PatientCategories.YES, PatientCategories.NO]) - values = np.array(counts).reshape((2, 2)) - - return pd.DataFrame(data=values, index=index, columns=columns) - def test_min_observed_HPO_threshold( self, patient_counts: typing.Tuple[ diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index 2ce006940..939d29353 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -1,19 +1,19 @@ import pytest from genophenocorr.analysis import HpoMtcReport -from genophenocorr.view import StatsViewer +from genophenocorr.view import MtcStatsViewer class TestStatsViewable: @pytest.fixture - def stats_viewer(self) -> StatsViewer: - return StatsViewer() + def stats_viewer(self) -> MtcStatsViewer: + return MtcStatsViewer() @pytest.mark.skip('Until we design a more reasonable test') def test_process( - self, - stats_viewer: StatsViewer, + self, + stats_viewer: MtcStatsViewer, ): mtc_report = HpoMtcReport( filter_name='identity filter', @@ -25,7 +25,7 @@ def test_process( 'Life is a conspiracy': 80, 'I need coffee': 7, }, - term_count=100, # The filtered out (80 + 7 + 5) + the unfiltered + n_terms_before_filtering=100, # The filtered out (80 + 7 + 5) + the unfiltered ) report = stats_viewer.process(hpo_mtc_report=mtc_report)
Using {{ hpo_mtc_filter_name }}, {{ skipped_hpo_count }} term(s) were omitted from statistical analysis. Using {{ hpo_mtc_filter_name }}, {{ skipped_hpo_count }} term(s) were omitted from statistical analysis.
Code Reason Count
TODO {{ skipped.reason }} {{ skipped.count }}