diff --git a/.flake8 b/.flake8
index 4124b2b6..fed4715d 100644
--- a/.flake8
+++ b/.flake8
@@ -3,7 +3,8 @@ max-line-length = 120
# F405 - ignore error if class is imported in `*` import.
# W293 - blank line contains whitespace
-ignore = F405,W293
+# W503 - line break before binary operator
+ignore = F405,W293,W503
exclude =
.git,
diff --git a/docs/img/phenotype_group_counts.png b/docs/img/phenotype_group_counts.png
deleted file mode 100644
index e08a430a..00000000
Binary files a/docs/img/phenotype_group_counts.png and /dev/null differ
diff --git a/docs/img/rere_phenotype_score_boxplot.png b/docs/img/rere_phenotype_score_boxplot.png
new file mode 100644
index 00000000..684e57ae
Binary files /dev/null and b/docs/img/rere_phenotype_score_boxplot.png differ
diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv
index db0572a0..e4a80ba2 100644
--- a/docs/report/tbx5_frameshift_vs_missense.csv
+++ b/docs/report/tbx5_frameshift_vs_missense.csv
@@ -1,18 +1,261 @@
"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,,
-,Count,Percent,Count,Percent,p value,Corrected p value
-Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,5.6190936213143254e-05,0.0008990549794102921
-Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.00043478260869565214,0.003478260869565217
-Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.0036231884057971015,0.014492753623188406
-Heart block [HP:0012722],0/22,0%,2/2,100%,0.0036231884057971015,0.014492753623188406
-Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.005628510156750059,0.018011232501600187
-Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.01349527665317139,0.03598740440845704
-Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.02560162393963452,0.058517997576307476
-Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.07440639019586388,0.14881278039172777
-Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.1424522583078588,0.2532484592139712
-Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.1687456462342971,0.2699930339748754
-Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.42857142857142855,0.6233766233766234
-Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.5714285714285713,0.7619047619047618
-Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.7735491022101784,0.9520604334894502
-Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
-Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
+,Count,Percent,Count,Percent,Corrected p values,p values
+Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05
+Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214
+Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015
+Heart block [HP:0012722],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015
+Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.018011232501600187,0.005628510156750059
+Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.03598740440845704,0.01349527665317139
+Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.058517997576307476,0.02560162393963452
+Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.14881278039172777,0.07440639019586388
+Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2532484592139712,0.1424522583078588
+Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2699930339748754,0.1687456462342971
+Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6233766233766234,0.42857142857142855
+Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.7619047619047618,0.5714285714285713
+Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.7735491022101784
Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
+Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
+Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
+Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,,
+Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,,
+Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
+Abnormal cardiovascular system morphology [HP:0030680],63/63,100%,30/30,100%,,
+Abnormality of the cardiovascular system [HP:0001626],65/65,100%,32/32,100%,,
+Phenotypic abnormality [HP:0000118],82/82,100%,38/38,100%,,
+All [HP:0000001],82/82,100%,38/38,100%,,
+Abnormal cardiac atrium morphology [HP:0005120],43/43,100%,20/20,100%,,
+Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,,
+Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,,
+Perimembranous ventricular septal defect [HP:0011682],3/59,5%,3/25,12%,,
+Short thumb [HP:0009778],11/41,27%,8/30,27%,,
+Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,,
+Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,,
+Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,,
+Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55,100%,22/22,100%,,
+Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55,100%,22/22,100%,,
+Aplasia/hypoplasia of the extremities [HP:0009815],55/55,100%,22/22,100%,,
+Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,,
+Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56,100%,23/23,100%,,
+Abnormal skeletal morphology [HP:0011842],73/73,100%,35/35,100%,,
+Abnormality of the skeletal system [HP:0000924],73/73,100%,35/35,100%,,
+Abnormality of the musculoskeletal system [HP:0033127],74/74,100%,35/35,100%,,
+Abnormal limb bone morphology [HP:0002813],63/63,100%,34/34,100%,,
+Abnormality of limb bone [HP:0040068],63/63,100%,34/34,100%,,
+Abnormal appendicular skeleton morphology [HP:0011844],64/64,100%,34/34,100%,,
+Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,,
+Abnormal hand morphology [HP:0005922],53/53,100%,20/20,100%,,
+Abnormality of the hand [HP:0001155],60/60,100%,31/31,100%,,
+Abnormal finger morphology [HP:0001167],36/36,100%,31/31,100%,,
+Abnormal digit morphology [HP:0011297],38/38,100%,33/33,100%,,
+Abnormal thumb morphology [HP:0001172],30/30,100%,31/31,100%,,
+Short finger [HP:0009381],11/11,100%,8/8,100%,,
+Short digit [HP:0011927],11/11,100%,10/10,100%,,
+Abnormality of thumb phalanx [HP:0009602],13/13,100%,13/13,100%,,
+Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,,
+Upper limb undergrowth [HP:0009824],33/33,100%,7/7,100%,,
+Limb undergrowth [HP:0009826],33/33,100%,7/7,100%,,
+Aplasia/hypoplasia involving forearm bones [HP:0006503],37/37,100%,12/12,100%,,
+Abnormal forearm bone morphology [HP:0040072],37/37,100%,14/14,100%,,
+Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,,
+Abnormal forearm morphology [HP:0002973],37/37,100%,14/14,100%,,
+Short long bone [HP:0003026],35/35,100%,9/9,100%,,
+Abnormal long bone morphology [HP:0011314],44/44,100%,13/13,100%,,
+Aplasia/Hypoplasia of the radius [HP:0006501],37/37,100%,11/11,100%,,
+Abnormal morphology of the radius [HP:0002818],37/37,100%,13/13,100%,,
+Aplasia of the fingers [HP:0009380],15/15,100%,14/14,100%,,
+Tricuspid regurgitation [HP:0005180],3/3,100%,0/0,0%,,
+Atrioventricular valve regurgitation [HP:0034376],4/4,100%,2/2,100%,,
+Abnormal atrioventricular valve physiology [HP:0031650],4/4,100%,2/2,100%,,
+Abnormal heart valve physiology [HP:0031653],4/4,100%,2/2,100%,,
+Abnormal cardiovascular system physiology [HP:0011025],23/23,100%,5/5,100%,,
+Abnormal tricuspid valve physiology [HP:0031651],3/3,100%,0/0,0%,,
+Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,2/2,100%,,
+Cleft soft palate [HP:0000185],2/2,100%,0/0,0%,,
+Abnormal soft palate morphology [HP:0100736],2/2,100%,0/0,0%,,
+Abnormal palate morphology [HP:0000174],5/5,100%,0/0,0%,,
+Abnormal oral cavity morphology [HP:0000163],5/5,100%,1/1,100%,,
+Abnormal oral morphology [HP:0031816],5/5,100%,1/1,100%,,
+Abnormality of the mouth [HP:0000153],5/5,100%,1/1,100%,,
+Abnormality of the face [HP:0000271],5/5,100%,1/1,100%,,
+Abnormality of the head [HP:0000234],5/5,100%,2/2,100%,,
+Abnormality of head or neck [HP:0000152],5/5,100%,2/2,100%,,
+Cleft palate [HP:0000175],2/2,100%,0/0,0%,,
+Orofacial cleft [HP:0000202],2/2,100%,0/0,0%,,
+Abnormal carpal morphology [HP:0001191],30/32,94%,0/0,0%,,
+Abnormality of the wrist [HP:0003019],30/30,100%,0/0,0%,,
+Abnormality of upper limb joint [HP:0009810],30/30,100%,6/6,100%,,
+Abnormal joint morphology [HP:0001367],31/31,100%,6/6,100%,,
+Mitral valve prolapse [HP:0001634],0/0,0%,1/1,100%,,
+Abnormal mitral valve morphology [HP:0001633],0/0,0%,1/1,100%,,
+Abnormal atrioventricular valve morphology [HP:0006705],0/0,0%,1/1,100%,,
+Abnormal heart valve morphology [HP:0001654],0/0,0%,1/1,100%,,
+Mitral regurgitation [HP:0001653],1/1,100%,2/2,100%,,
+Abnormal mitral valve physiology [HP:0031481],1/1,100%,2/2,100%,,
+Limited pronation/supination of forearm [HP:0006394],0/0,0%,3/3,100%,,
+Limited elbow movement [HP:0002996],0/0,0%,4/4,100%,,
+Abnormality of the elbow [HP:0009811],0/0,0%,5/5,100%,,
+Limitation of joint mobility [HP:0001376],0/0,0%,4/4,100%,,
+Abnormality of joint mobility [HP:0011729],1/1,100%,5/5,100%,,
+Abnormal joint physiology [HP:0034430],1/1,100%,5/5,100%,,
+Abnormal musculoskeletal physiology [HP:0011843],1/1,100%,5/5,100%,,
+Abnormality of cardiovascular system electrophysiology [HP:0030956],15/15,100%,3/3,100%,,
+Upper limb phocomelia [HP:0009813],8/85,9%,2/37,5%,,
+Phocomelia [HP:0009829],8/8,100%,2/2,100%,,
+Pre-capillary pulmonary hypertension [HP:0033578],4/4,100%,0/0,0%,,
+Elevated pulmonary artery pressure [HP:0004890],4/4,100%,0/0,0%,,
+Abnormality of pulmonary circulation [HP:0030875],4/4,100%,0/0,0%,,
+Abnormal vascular physiology [HP:0030163],4/4,100%,0/0,0%,,
+Abnormality of the vasculature [HP:0002597],10/10,100%,2/2,100%,,
+Abnormal respiratory system physiology [HP:0002795],4/4,100%,0/0,0%,,
+Abnormality of the respiratory system [HP:0002086],4/4,100%,0/0,0%,,
+Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,0/0,0%,,
+Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,0/0,0%,,
+Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,0/0,0%,,
+Abnormal finger phalanx morphology [HP:0005918],3/3,100%,0/0,0%,,
+Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,0/0,0%,,
+Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,0/0,0%,,
+Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,0/0,0%,,
+Aplasia involving bones of the extremities [HP:0009825],0/0,0%,0/0,0%,,
+Aplasia of metacarpal bones [HP:0010048],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,0/0,0%,,
+Abnormal metacarpal morphology [HP:0005916],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,0/0,0%,,
+Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,0/0,0%,,
+Partial absence of thumb [HP:0009659],0/0,0%,0/0,0%,,
+1-2 finger syndactyly [HP:0010704],3/3,100%,1/1,100%,,
+Finger syndactyly [HP:0006101],4/4,100%,2/2,100%,,
+Syndactyly [HP:0001159],4/4,100%,2/2,100%,,
+Complete atrioventricular canal defect [HP:0001674],5/37,14%,3/36,8%,,
+Atrioventricular canal defect [HP:0006695],5/5,100%,3/3,100%,,
+Persistent left superior vena cava [HP:0005301],2/37,5%,0/0,0%,,
+Abnormal superior vena cava morphology [HP:0025575],2/2,100%,0/0,0%,,
+Abnormal vena cava morphology [HP:0005345],2/2,100%,0/0,0%,,
+Abnormal morphology of the great vessels [HP:0030962],6/6,100%,2/2,100%,,
+Abnormal blood vessel morphology [HP:0033353],6/6,100%,2/2,100%,,
+Abnormal vascular morphology [HP:0025015],6/6,100%,2/2,100%,,
+Abnormal venous morphology [HP:0002624],2/2,100%,0/0,0%,,
+Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,,
+Abnormal thorax morphology [HP:0000765],6/6,100%,5/5,100%,,
+Abnormal axial skeleton morphology [HP:0009121],8/8,100%,5/5,100%,,
+Aplasia/hypoplasia of the humerus [HP:0006507],7/7,100%,4/4,100%,,
+Abnormality of the humerus [HP:0003063],7/7,100%,4/4,100%,,
+Abnormality of the upper arm [HP:0001454],7/7,100%,4/4,100%,,
+Right atrial enlargement [HP:0030718],4/4,100%,0/0,0%,,
+Abnormal right atrium morphology [HP:0025580],4/4,100%,0/0,0%,,
+Atrial septal dilatation [HP:0011995],4/4,100%,0/0,0%,,
+Pectus excavatum [HP:0000767],3/4,75%,2/2,100%,,
+Abnormal sternum morphology [HP:0000766],3/3,100%,2/2,100%,,
+Postaxial hand polydactyly [HP:0001162],3/4,75%,0/0,0%,,
+Postaxial polydactyly [HP:0100259],3/3,100%,0/0,0%,,
+Polydactyly [HP:0010442],3/3,100%,0/0,0%,,
+Abnormal 5th finger morphology [HP:0004207],4/4,100%,0/0,0%,,
+Hand polydactyly [HP:0001161],3/3,100%,0/0,0%,,
+Duplication of phalanx of hand [HP:0009997],3/3,100%,0/0,0%,,
+Duplication of hand bones [HP:0004275],3/3,100%,0/0,0%,,
+Duplication of bones involving the upper extremities [HP:0009142],3/3,100%,0/0,0%,,
+High palate [HP:0000218],3/3,100%,0/0,0%,,
+Short neck [HP:0000470],3/3,100%,0/0,0%,,
+Abnormal neck morphology [HP:0025668],3/3,100%,0/0,0%,,
+Abnormality of the neck [HP:0000464],3/3,100%,0/0,0%,,
+Abnormality of the cervical spine [HP:0003319],3/3,100%,0/0,0%,,
+Abnormality of the vertebral column [HP:0000925],4/4,100%,1/1,100%,,
+Shield chest [HP:0000914],3/3,100%,0/0,0%,,
+Enlarged thorax [HP:0100625],3/3,100%,0/0,0%,,
+Abnormal rib cage morphology [HP:0001547],4/4,100%,0/0,0%,,
+Y-shaped metatarsals [HP:0010567],3/3,100%,0/0,0%,,
+Abnormal metatarsal morphology [HP:0001832],3/3,100%,0/0,0%,,
+Abnormal lower limb bone morphology [HP:0040069],3/3,100%,0/0,0%,,
+Abnormality of the lower limb [HP:0002814],3/3,100%,0/0,0%,,
+Abnormal foot morphology [HP:0001760],3/3,100%,0/0,0%,,
+Upper extremity joint dislocation [HP:0030310],0/0,0%,2/2,100%,,
+Joint dislocation [HP:0001373],0/0,0%,2/2,100%,,
+Clinodactyly of the 5th finger [HP:0004209],1/1,100%,0/0,0%,,
+Finger clinodactyly [HP:0040019],1/1,100%,0/0,0%,,
+Clinodactyly [HP:0030084],1/1,100%,0/0,0%,,
+Deviation of finger [HP:0004097],1/1,100%,2/2,100%,,
+Deviation of the 5th finger [HP:0009179],1/1,100%,0/0,0%,,
+Proximal placement of thumb [HP:0009623],0/0,0%,0/0,0%,,
+Deviation of the thumb [HP:0009603],0/0,0%,2/2,100%,,
+Abnormal 2nd finger morphology [HP:0004100],1/1,100%,0/0,0%,,
+Aplasia/Hypoplasia of the 2nd finger [HP:0006264],1/1,100%,0/0,0%,,
+Short middle phalanx of finger [HP:0005819],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,0/0,0%,,
+Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,0/0,0%,,
+Short phalanx of finger [HP:0009803],0/0,0%,0/0,0%,,
+Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,0/0,0%,,
+Type A brachydactyly [HP:0009370],0/0,0%,0/0,0%,,
+Brachydactyly [HP:0001156],0/0,0%,0/0,0%,,
+Short 5th finger [HP:0009237],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,0/0,0%,,
+Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,0/0,0%,,
+Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,0/0,0%,,
+Congenital malformation of the great arteries [HP:0011603],4/4,100%,2/2,100%,,
+Aplasia involving forearm bones [HP:0009822],7/7,100%,6/6,100%,,
+Absent forearm bone [HP:0003953],7/7,100%,6/6,100%,,
+Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,2/2,100%,,
+Abnormal morphology of ulna [HP:0040071],2/2,100%,4/4,100%,,
+Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,0/0,0%,,
+Shoulder muscle hypoplasia [HP:0008952],0/0,0%,0/0,0%,,
+Hypoplasia of the musculature [HP:0009004],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,0/0,0%,,
+Abnormality of muscle size [HP:0030236],0/0,0%,0/0,0%,,
+Abnormal skeletal muscle morphology [HP:0011805],2/2,100%,0/0,0%,,
+Abnormality of the musculature [HP:0003011],2/2,100%,0/0,0%,,
+Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,0/0,0%,,
+Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,0/0,0%,,
+Abnormality of the musculature of the limbs [HP:0009127],2/2,100%,0/0,0%,,
+Abnormality of the musculature of the upper limbs [HP:0001446],2/2,100%,0/0,0%,,
+Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,0/0,0%,,
+Atrioventricular dissociation [HP:0011709],0/22,0%,1/1,100%,,
+Abnormal toe morphology [HP:0001780],0/0,0%,0/0,0%,,
+Abnormal toe phalanx morphology [HP:0010161],0/0,0%,0/0,0%,,
+Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,0/0,0%,,
+Patent foramen ovale [HP:0001655],4/40,10%,0/36,0%,,
+Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,,
+Short 1st metacarpal [HP:0010034],0/30,0%,0/22,0%,,
+Short phalanx of the thumb [HP:0009660],0/30,0%,0/22,0%,,
+11 pairs of ribs [HP:0000878],1/1,100%,0/0,0%,,
+Missing ribs [HP:0000921],1/1,100%,0/0,0%,,
+Aplasia/Hypoplasia of the ribs [HP:0006712],1/1,100%,0/0,0%,,
+Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],1/1,100%,2/2,100%,,
+Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,2/2,100%,,
+Abnormal rib morphology [HP:0000772],1/1,100%,0/0,0%,,
+First degree atrioventricular block [HP:0011705],0/22,0%,1/1,100%,,
+Micrognathia [HP:0000347],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,,
+Abnormal skull morphology [HP:0000929],1/1,100%,2/2,100%,,
+Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,,
+Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,,
+Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,,
+Bowed forearm bones [HP:0003956],0/0,0%,1/1,100%,,
+Bowing of the arm [HP:0006488],0/0,0%,1/1,100%,,
+Bowing of the long bones [HP:0006487],0/0,0%,1/1,100%,,
+Abnormal diaphysis morphology [HP:0000940],0/0,0%,1/1,100%,,
+Common atrium [HP:0011565],0/83,0%,0/38,0%,,
+Unroofed coronary sinus [HP:0031297],0/85,0%,0/38,0%,,
+Amelia involving the upper limbs [HP:0009812],0/83,0%,1/37,3%,,
+Third degree atrioventricular block [HP:0001709],0/22,0%,1/1,100%,,
+Small hypothenar eminence [HP:0010487],2/2,100%,0/0,0%,,
+Abnormality of the hypothenar eminence [HP:0010486],2/2,100%,0/0,0%,,
+Abnormality of the musculature of the hand [HP:0001421],2/2,100%,0/0,0%,,
+Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,,
+Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,,
+Left ventricular noncompaction cardiomyopathy [HP:0011664],0/1,0%,1/5,20%,,
+Noncompaction cardiomyopathy [HP:0012817],0/0,0%,1/1,100%,,
+Cardiomyopathy [HP:0001638],0/0,0%,1/1,100%,,
+Abnormal myocardium morphology [HP:0001637],0/0,0%,1/1,100%,,
+Sinus bradycardia [HP:0001688],0/0,0%,1/1,100%,,
+Abnormal electrophysiology of sinoatrial node origin [HP:0011702],0/0,0%,1/1,100%,,
+Arrhythmia [HP:0011675],1/1,100%,1/1,100%,,
+Bradycardia [HP:0001662],0/0,0%,1/1,100%,,
+Abnormal skin morphology [HP:0011121],0/0,0%,1/1,100%,,
+Abnormality of the skin [HP:0000951],0/0,0%,1/1,100%,,
+Abnormality of the integument [HP:0001574],0/0,0%,1/1,100%,,
+Sinus venosus atrial septal defect [HP:0011567],0/2,0%,1/1,100%,,
diff --git a/docs/report/tbx5_mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
similarity index 91%
rename from docs/report/tbx5_mtc_report.html
rename to docs/report/tbx5_frameshift_vs_missense.mtc_report.html
index 175a0245..78345301 100644
--- a/docs/report/tbx5_mtc_report.html
+++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
@@ -48,9 +48,9 @@
Phenotype testing report
Phenotype MTC filter: HPO MTC filter
Multiple testing correction: fdr_bh
- Performed statistical tests for 15 out of the total of 259 HPO terms.
+ Performed statistical tests for 16 out of the total of 259 HPO terms.
- Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
+ Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.
Code |
@@ -61,15 +61,15 @@ Phenotype testing report
TODO |
- Skipping term because all genotypes have same HPO observed proportions |
- 42 |
+ Skipping general term |
+ 43 |
TODO |
- Skipping general term |
- 41 |
+ Skipping term because all genotypes have same HPO observed proportions |
+ 42 |
@@ -135,13 +135,6 @@ Phenotype testing report
3 |
-
-
- TODO |
- Skipping non-target term |
- 3 |
-
-
TODO |
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 9fad88dd..a2bc3230 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -9,16 +9,16 @@ The tutorial illustrates just one of the many ways GPSEA can be used to characte
The :ref:`user-guide` contains details about additional methods and functionalities.
-The tutorial presents an analysis of a cohort of individuals with pathogenic variants in *TBX5* leading to
-`Holt-Oram syndrome MIM:142900 `_.
+The tutorial presents an analysis of a cohort of individuals with pathogenic variants in *TBX5* leading to
+`Holt-Oram syndrome MIM:142900 `_.
-Holt-Oram syndrome is an autosomal dominant disorder characterized by
+Holt-Oram syndrome is an autosomal dominant disorder characterized by
upper limb defects, congenital heart defects, and arrhythmias (`PMID:38336121 `_).
It has been observed in the literature that congenital defects of the ventricular and atrial septum are more
common in the truncating than in the missense variants (`PMID:30552424 `_).
Additionally, upper limb defects are more frequent in patients with protein-truncating variants (`PMID:38336121 `_).
-To perform the analysis, we curated the literature and created on `GA4GH phenopacket `_ for each
+To perform the analysis, we curated the literature and created on `GA4GH phenopacket `_ for each
affected individual. The phenopackets are made available in `Phenopacket Store `_.
@@ -26,9 +26,9 @@ affected individual. The phenopackets are made available in `Phenopacket Store <
The analysis
~~~~~~~~~~~~
-For the analysis, the `MANE `_ transcript
+For the analysis, the `MANE `_ transcript
(i.e., the "main" biomedically relevant transcript of a gene) should be chosen unless
-there is a specific reason not to (which should occur rarely if at all).
+there is a specific reason not to (which should occur rarely if at all).
In the case of *TBX5* the MANE transcript is `NM_181486.4`. Note that the trascript identifier (`NM_181486`) and the version (`4`) are both required.
A good way to find the MANE transcript is to search on the gene symbol (e.g., *TBX5*) in `ClinVar `_ and to
@@ -49,7 +49,7 @@ corresponding protein accession `NP_852259.1`.
Load HPO
^^^^^^^^
-GPSEA needs HPO to do the analysis.
+GPSEA needs HPO to do the analysis.
We use HPO toolkit to load HPO version `v2023-10-09`:
>>> import hpotk
@@ -75,13 +75,13 @@ and stored in `Phenopacket Store >> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets
>>> cohort_creator = configure_caching_cohort_creator(hpo)
>>> cohort, validation = load_phenopackets( # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
-... phenopackets=phenopackets,
+... phenopackets=phenopackets,
... cohort_creator=cohort_creator,
... )
Patients Created: ...
@@ -96,7 +96,7 @@ We loaded the patient data into a `cohort` which is ready for the next steps.
.. seealso::
- Here we show how to create a :class:`~gpsea.model.Cohort` from phenopackets.
+ Here we show how to create a :class:`~gpsea.model.Cohort` from phenopackets.
See :ref:`input-data` section to learn how to create a cohort from another inputs.
@@ -113,7 +113,7 @@ We can now explore the cohort to see how many patients are included.
.. raw:: html
:file: report/tbx5_cohort_info.html
-
+
.. note::
The report can also be displayed directly in a Jupyter notebook by running::
@@ -123,14 +123,14 @@ We can now explore the cohort to see how many patients are included.
Now we can show the distribution of variants with respect to the encoded protein.
We first obtain `tx_coordinates` (:class:`~gpsea.model.TranscriptCoordinates`)
-and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`)
+and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`)
with information about the transcript and protein "anatomy":
>>> from gpsea.model.genome import GRCh38
>>> from gpsea.preprocessing import configure_protein_metadata_service, VVMultiCoordinateService
>>> txc_service = VVMultiCoordinateService(genome_build=GRCh38)
>>> pms = configure_protein_metadata_service()
->>> tx_coordinates = txc_service.fetch(tx_id)
+>>> tx_coordinates = txc_service.fetch(tx_id)
>>> protein_meta = pms.annotate(px_id)
and we follow with plotting the diagram of the mutations on the protein:
@@ -158,7 +158,7 @@ Prepare genotype and phenotype predicates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We will create a predicate to bin patients into group
-depending on presence of a missense and frameshift variant to test
+depending on presence of a missense and frameshift variant to test
if there is a difference between frameshift and non-frameshift variants
in the individuals of the *TBX5* cohort.
@@ -176,74 +176,99 @@ in the individuals of the *TBX5* cohort.
.. note::
- There are many other ways to set up a predicate for testing
+ There are many other ways to set up a predicate for testing
for a GP correlation.
See the :ref:`predicates` section to learn more about building
a predicate of interest.
+The phenotype grouping is based on presence or absence of an HPO term.
+We take advantage of the ontology "true path rule" to infer presence
+of the ancestor terms for all present HPO terms
+(e.g. presence of `Abnormal ventricular septum morphology `_
+in an individual annotated with `Ventricular septal defect `_)
+and exclusion of the descendant terms for all excluded terms (e.g. exclusion of
+`Motor seizure `_
+in an individual where `Seizure `_
+was excluded):
+
+>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
+>>> pheno_predicates = prepare_predicates_for_terms_of_interest(
+... cohort=cohort,
+... hpo=hpo,
+... min_n_of_patients_with_term=2,
+... )
-By default, GPSEA will perform one hypothesis test for each HPO term used to annotate more than one individual in the cohort.
-This also includes the terms implied by the ontology "true path rule",
-which states that presence of a term
-(e.g., `Ventricular septal defect `_)
-implies presence of all its ancestor terms
-(e.g., `Abnormal ventricular septum morphology `_,
-`Abnormal cardiac septum morphology `_,
-`Abnormal cardiac ventricle morphology `_, ...).
-However, testing multiple hypothesis increases the chance of receiving false positive result,
-and multiple testing correction must be applied.
-See :ref:`mtc` for information about how to perform multiple testing correction with GPSEA.
+By default, GPSEA will perform one hypothesis test for each HPO term used to annotate two or more individuals in the cohort
+(see ``min_n_of_patients_with_term=2`` above).
+Testing multiple hypothesis on the same dataset increases the chance of receiving false positive result.
+However, GPSEA simplifies the application of an appropriate multiple testing correction.
For general use, we recommend using a combination
of a *Phenotype MTC filter* (:class:`~gpsea.analysis.PhenotypeMtcFilter`) with a *multiple testing correction*.
-Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which
+Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which
reduce the multiple testing burden and focus the analysis
on the most interesting terms (see :ref:`HPO MTC filter ` for more info).
Then the multiple testing correction, such as Bonferroni or Benjamini-Hochberg,
is used to control the family-wise error rate or the false discovery rate.
+See :ref:`mtc` for more information.
-Here we use HPO MTC filter (:meth:`~gpsea.analysis.CohortAnalysisConfiguration.hpo_mtc_strategy`)
-along with Benjamini-Hochberg procedure (:meth:`~gpsea.analysis.CohortAnalysisConfiguration.pval_correction`):
+In this example, we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`)
+with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``)
+with a false discovery control level at (``mtc_alpha=0.05``):
->>> from gpsea.analysis import configure_cohort_analysis, CohortAnalysisConfiguration
->>> config = CohortAnalysisConfiguration()
->>> config.hpo_mtc_strategy()
->>> config.pval_correction = 'fdr_bh'
->>> analysis = configure_cohort_analysis(
-... cohort=cohort,
-... hpo=hpo,
-... config=config,
+>>> from gpsea.analysis.mtc_filter import HpoMtcFilter
+>>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=0.2)
+>>> mtc_correction = 'fdr_bh'
+>>> mtc_alpha = 0.05
+
+Choosing the statistical procedure for assessment of association between genotype and phenotype
+groups is the last missing piece of the analysis. We will use Fisher Exact Test:
+
+>>> from gpsea.analysis.pcats.stats import ScipyFisherExact
+>>> count_statistic = ScipyFisherExact()
+
+and we finalize the analysis setup by putting all components together
+into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
+
+>>> from gpsea.analysis.pcats import HpoTermAnalysis
+>>> analysis = HpoTermAnalysis(
+... count_statistic=count_statistic,
+... mtc_filter=mtc_filter,
+... mtc_correction=mtc_correction,
+... mtc_alpha=mtc_alpha,
... )
Now we can perform the analysis and investigate the results.
->>> result = analysis.compare_genotype_vs_cohort_phenotypes(gt_predicate)
+>>> result = analysis.compare_genotype_vs_phenotypes(
+... cohort=cohort,
+... gt_predicate=gt_predicate,
+... pheno_predicates=pheno_predicates,
+... )
>>> result.total_tests
16
We only tested 16 HPO terms. This is despite the individuals being collectively annotated with
259 direct and indirect HPO terms
->>> from gpsea.analysis import prepare_hpo_terms_of_interest
->>> terms = prepare_hpo_terms_of_interest(hpo, cohort.all_patients, min_n_of_patients_with_term=2)
->>> len(terms)
+>>> len(result.phenotypes)
259
We can show the reasoning behind *not* testing 243 (`259 - 16`) HPO terms
by exploring the phenotype MTC filtering report.
>>> from gpsea.view import MtcStatsViewer
->>> mtc_viewer = MtcStatsViewer()
->>> mtc_report = mtc_viewer.process(result.mtc_filter_report)
->>> with open('docs/report/tbx5_mtc_report.html', 'w') as fh: # doctest: +SKIP
+>>> mtc_viewer = MtcStatsViewer()
+>>> mtc_report = mtc_viewer.process(result)
+>>> with open('docs/report/tbx5_frameshift_vs_missense.mtc_report.html', 'w') as fh: # doctest: +SKIP
... _ = fh.write(mtc_report)
.. raw:: html
- :file: report/tbx5_mtc_report.html
+ :file: report/tbx5_frameshift_vs_missense.mtc_report.html
..
-
- TODO:
+
+ TODO:
Show how to write out the tested HPO terms.
and these are the HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:
@@ -259,8 +284,8 @@ and these are the HPO terms ordered by the p value corrected with the Benjamini-
We see that several HPO terms are significantly associated
with presence of a frameshift variant in *TBX5*.
For example, `Ventricular septal defect `_
-was observed in 31/60 (52%) patients with a missense variant
+was observed in 31/60 (52%) patients with a missense variant
but it was observed in 19/19 (100%) patients with a frameshift variant.
-Fisher exact test computed a p value of `~0.0000562`
-and the p value corrected by Benjamini-Hochberg procedure
+Fisher exact test computed a p value of `~0.0000562`
+and the p value corrected by Benjamini-Hochberg procedure
is `~0.00112`.
diff --git a/docs/user-guide/glossary.rst b/docs/user-guide/glossary.rst
new file mode 100644
index 00000000..bf8f9d90
--- /dev/null
+++ b/docs/user-guide/glossary.rst
@@ -0,0 +1,89 @@
+.. _glossary:
+
+========
+Glossary
+========
+
+The glossary summarizes several frequently used concepts.
+
+.. _true-path-rule:
+
+True path rule
+~~~~~~~~~~~~~~
+
+The true path rule of ontologies states that an item (e.g. an individual) annotated with an ontology term
+(e.g. `Focal-onset seizure `_)
+is implicitly annotated with all its *ancestor* terms
+(`Seizure `_,
+`Abnormal nervous system physiology `_, ...).
+Conversely, exclusion of a term (e.g. `Abnormal ventricular septum morphology `_)
+implies exclusion of all its *descendants*
+(`Ventricular septal defect `_,
+`Ventricular septal aneurysm `_, ...).
+
+
+.. _length-of-the-reference-allele:
+
+Length of the reference allele
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The length of the REF allele corresponds to the length of the genomic region affected by the variant.
+
+Let's show a few examples.
+
+>>> from gpsea.model import VariantCoordinates
+>>> from gpsea.model.genome import GRCh38
+>>> chr1 = GRCh38.contig_by_name("chr1")
+
+The length of the reference allele of a missense variant is 1
+because the variant affects a 1-bp region spanned by the ``C`` nucleotide:
+
+>>> missense = VariantCoordinates.from_vcf_literal(chr1, 1001, 'C', 'T')
+>>> len(missense)
+1
+
+The length of a "small" deletion is the same as the length of the ref allele `str`:
+(``'CCC'`` in the example below):
+
+>>> deletion = VariantCoordinates.from_vcf_literal(chr1, 1001, 'CCC', 'C')
+>>> len(deletion)
+3
+
+This is because the literal notation spells out the alleles.
+However, this simple rule does not apply in symbolic notation.
+Here, the REF length corresponds to the length of the allele region.
+
+For instance, for the following structural variant::
+
+ #CHROM POS ID REF ALT QUAL FILTER INFO
+ 1 1001 . C 6 PASS SVTYPE=DEL;END=1003;SVLEN=-3
+
+the length of the REF allele is `3`:
+
+>>> sv_deletion = VariantCoordinates.from_vcf_symbolic(
+... chr1, pos=1001, end=1003,
+... ref='C', alt='', svlen=-3,
+... )
+>>> len(sv_deletion)
+3
+
+because the deletion removes 3 base pairs at the coordinates :math:`[1001, 1003]`.
+
+
+.. _change-length-of-an-allele:
+
+Change length of an allele
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Change length is defined as the difference between lengths of the *alt* and *ref* alleles.
+SNVs lead to change length of zero, deletions and insertions/duplications lead to negative
+and positive change lengths, respectively. See the table below for examples.
+
+================== ================== ===============
+ Reference allele Alternate allele Change length
+================== ================== ===============
+ C T 0
+ CG AT 0
+ ACTG A -3
+ A AAT 2
+================== ================== ===============
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index 923b1834..9f7933ee 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -13,6 +13,7 @@ TODO - write high level overview and bridge to individual sections.
input-data
exploratory
predicates
- mtc
stats
+ mtc
+ glossary
autosomal_recessive
diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst
index 30ee95df..7aa3e23e 100644
--- a/docs/user-guide/mtc.rst
+++ b/docs/user-guide/mtc.rst
@@ -104,6 +104,8 @@ This is how we can set an alternative MTC correction procedure:
'fdr_bh'
+.. _mtc-filters:
+
MTC filters: Choosing which terms to test
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv
new file mode 100644
index 00000000..d794cec4
--- /dev/null
+++ b/docs/user-guide/report/tbx5_frameshift.csv
@@ -0,0 +1,262 @@
+FRAMESHIFT_VARIANT on NM_181486.4,Yes,Yes,No,No,,
+,Count,Percent,Count,Percent,Corrected p values,p values
+Ventricular septal defect [HP:0001629],19/19,100%,42/71,59%,0.004112753923262261,0.00024192670136836828
+Abnormal atrioventricular conduction [HP:0005150],3/3,100%,1/23,4%,0.013076923076923076,0.0015384615384615385
+Absent thumb [HP:0009777],14/31,45%,18/100,18%,0.021044138590779585,0.0037136715160199273
+Atrioventricular block [HP:0001678],2/2,100%,1/23,4%,0.034,0.010000000000000002
+Heart block [HP:0012722],2/2,100%,1/23,4%,0.034,0.010000000000000002
+Patent ductus arteriosus [HP:0001643],2/2,100%,6/40,15%,0.09214092140921408,0.03252032520325203
+Secundum atrial septal defect [HP:0001684],4/22,18%,23/55,42%,0.1440020479198931,0.06544319142266644
+Triphalangeal thumb [HP:0001199],13/32,41%,23/99,23%,0.1440020479198931,0.06932119159387057
+Cardiac conduction abnormality [HP:0031546],3/3,100%,15/37,41%,0.1440020479198931,0.08259109311740892
+Muscular ventricular septal defect [HP:0011623],6/25,24%,8/84,10%,0.1440020479198931,0.08470708701170182
+Pulmonary arterial hypertension [HP:0002092],0/2,0%,8/14,57%,0.6899307928951144,0.4666666666666667
+Short thumb [HP:0009778],8/30,27%,25/69,36%,0.6899307928951144,0.4870099714553749
+Absent radius [HP:0003974],6/25,24%,9/43,21%,1.0,0.7703831604944444
+Atrial septal defect [HP:0001631],20/20,100%,63/65,97%,1.0,1.0
+Hypoplasia of the radius [HP:0002984],6/14,43%,34/75,45%,1.0,1.0
+Short humerus [HP:0005792],4/9,44%,8/21,38%,1.0,1.0
+Hypoplasia of the ulna [HP:0003022],2/10,20%,3/17,18%,1.0,1.0
+Aplasia/Hypoplasia of the thumb [HP:0009601],19/19,100%,40/40,100%,,
+Aplasia/Hypoplasia of fingers [HP:0006265],19/19,100%,44/44,100%,,
+Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19,100%,44/44,100%,,
+Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22,100%,78/78,100%,,
+Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22,100%,78/78,100%,,
+Aplasia/hypoplasia of the extremities [HP:0009815],22/22,100%,78/78,100%,,
+Abnormality of limbs [HP:0040064],34/34,100%,102/102,100%,,
+Phenotypic abnormality [HP:0000118],38/38,100%,114/114,100%,,
+All [HP:0000001],38/38,100%,114/114,100%,,
+Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23,100%,80/80,100%,,
+Abnormal skeletal morphology [HP:0011842],35/35,100%,103/103,100%,,
+Abnormality of the skeletal system [HP:0000924],35/35,100%,103/103,100%,,
+Abnormality of the musculoskeletal system [HP:0033127],35/35,100%,104/104,100%,,
+Abnormal limb bone morphology [HP:0002813],34/34,100%,92/92,100%,,
+Abnormality of limb bone [HP:0040068],34/34,100%,92/92,100%,,
+Abnormal appendicular skeleton morphology [HP:0011844],34/34,100%,93/93,100%,,
+Abnormality of the upper limb [HP:0002817],34/34,100%,102/102,100%,,
+Abnormal hand morphology [HP:0005922],20/20,100%,75/75,100%,,
+Abnormality of the hand [HP:0001155],31/31,100%,88/88,100%,,
+Abnormal finger morphology [HP:0001167],31/31,100%,64/64,100%,,
+Abnormal digit morphology [HP:0011297],33/33,100%,67/67,100%,,
+Abnormal thumb morphology [HP:0001172],31/31,100%,58/58,100%,,
+Short finger [HP:0009381],8/8,100%,27/27,100%,,
+Short digit [HP:0011927],10/10,100%,28/28,100%,,
+Abnormality of thumb phalanx [HP:0009602],13/13,100%,26/26,100%,,
+Congenital malformation of the great arteries [HP:0011603],2/2,100%,7/7,100%,,
+Abnormal morphology of the great vessels [HP:0030962],2/2,100%,10/10,100%,,
+Abnormal blood vessel morphology [HP:0033353],2/2,100%,11/11,100%,,
+Abnormal vascular morphology [HP:0025015],2/2,100%,11/11,100%,,
+Abnormal cardiovascular system morphology [HP:0030680],30/30,100%,92/92,100%,,
+Abnormality of the cardiovascular system [HP:0001626],32/32,100%,94/94,100%,,
+Abnormality of the vasculature [HP:0002597],2/2,100%,17/17,100%,,
+Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,3/3,100%,,
+Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,3/3,100%,,
+Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,3/3,100%,,
+Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,3/3,100%,,
+Abnormal finger phalanx morphology [HP:0005918],0/0,0%,9/9,100%,,
+Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,6/6,100%,,
+Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,3/3,100%,,
+Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,3/3,100%,,
+Aplasia involving bones of the extremities [HP:0009825],0/0,0%,3/3,100%,,
+Finger aplasia [HP:0009380],14/14,100%,23/23,100%,,
+Aplasia of metacarpal bones [HP:0010048],0/0,0%,3/3,100%,,
+Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,4/4,100%,,
+Abnormal metacarpal morphology [HP:0005916],0/0,0%,4/4,100%,,
+Abnormal upper limb bone morphology [HP:0040070],14/14,100%,50/50,100%,,
+Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,4/4,100%,,
+Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,4/4,100%,,
+Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,4/4,100%,,
+Partial absence of thumb [HP:0009659],0/0,0%,3/3,100%,,
+Abnormal atrial septum morphology [HP:0011994],20/20,100%,64/64,100%,,
+Abnormal cardiac septum morphology [HP:0001671],28/28,100%,89/89,100%,,
+Abnormal heart morphology [HP:0001627],30/30,100%,89/89,100%,,
+Abnormal cardiac atrium morphology [HP:0005120],20/20,100%,64/64,100%,,
+Perimembranous ventricular septal defect [HP:0011682],3/25,12%,6/84,7%,,
+Abnormal ventricular septum morphology [HP:0010438],19/19,100%,42/42,100%,,
+Abnormal cardiac ventricle morphology [HP:0001713],19/19,100%,43/43,100%,,
+Forearm undergrowth [HP:0009821],7/7,100%,35/35,100%,,
+Upper limb undergrowth [HP:0009824],7/7,100%,38/38,100%,,
+Limb undergrowth [HP:0009826],7/7,100%,38/38,100%,,
+Aplasia/hypoplasia involving forearm bones [HP:0006503],12/12,100%,43/43,100%,,
+Abnormal forearm bone morphology [HP:0040072],14/14,100%,43/43,100%,,
+Abnormal forearm morphology [HP:0002973],14/14,100%,43/43,100%,,
+Short long bone [HP:0003026],9/9,100%,41/41,100%,,
+Abnormal long bone morphology [HP:0011314],13/13,100%,50/50,100%,,
+Aplasia/Hypoplasia of the radius [HP:0006501],11/11,100%,43/43,100%,,
+Abnormal morphology of the radius [HP:0002818],13/13,100%,43/43,100%,,
+Abnormal carpal morphology [HP:0001191],0/0,0%,30/32,94%,,
+Abnormality of the wrist [HP:0003019],0/0,0%,30/30,100%,,
+Abnormality of upper limb joint [HP:0009810],6/6,100%,32/32,100%,,
+Abnormal joint morphology [HP:0001367],6/6,100%,33/33,100%,,
+Persistent left superior vena cava [HP:0005301],0/0,0%,4/39,10%,,
+Abnormal superior vena cava morphology [HP:0025575],0/0,0%,4/4,100%,,
+Abnormal vena cava morphology [HP:0005345],0/0,0%,4/4,100%,,
+Abnormal venous morphology [HP:0002624],0/0,0%,4/4,100%,,
+Tricuspid regurgitation [HP:0005180],0/0,0%,5/5,100%,,
+Atrioventricular valve regurgitation [HP:0034376],2/2,100%,7/7,100%,,
+Abnormal atrioventricular valve physiology [HP:0031650],2/2,100%,7/7,100%,,
+Abnormal heart valve physiology [HP:0031653],2/2,100%,7/7,100%,,
+Abnormal cardiovascular system physiology [HP:0011025],5/5,100%,30/30,100%,,
+Abnormal tricuspid valve physiology [HP:0031651],0/0,0%,5/5,100%,,
+Micrognathia [HP:0000347],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,,
+Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,3/3,100%,,
+Abnormal axial skeleton morphology [HP:0009121],5/5,100%,9/9,100%,,
+Abnormal skull morphology [HP:0000929],2/2,100%,1/1,100%,,
+Abnormality of the head [HP:0000234],2/2,100%,5/5,100%,,
+Abnormality of head or neck [HP:0000152],2/2,100%,5/5,100%,,
+Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,,
+Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,,
+Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,,
+Cleft soft palate [HP:0000185],0/0,0%,2/2,100%,,
+Abnormal soft palate morphology [HP:0100736],0/0,0%,2/2,100%,,
+Abnormal palate morphology [HP:0000174],0/0,0%,5/5,100%,,
+Abnormal oral cavity morphology [HP:0000163],1/1,100%,5/5,100%,,
+Abnormal oral morphology [HP:0031816],1/1,100%,5/5,100%,,
+Abnormality of the mouth [HP:0000153],1/1,100%,5/5,100%,,
+Abnormality of the face [HP:0000271],1/1,100%,5/5,100%,,
+Cleft palate [HP:0000175],0/0,0%,2/2,100%,,
+Orofacial cleft [HP:0000202],0/0,0%,2/2,100%,,
+Craniofacial cleft [HP:5201015],0/0,0%,2/2,100%,,
+Aplasia involving forearm bones [HP:0009822],6/6,100%,9/9,100%,,
+Absent forearm bone [HP:0003953],6/6,100%,9/9,100%,,
+Aplasia/hypoplasia of the humerus [HP:0006507],4/4,100%,8/8,100%,,
+Abnormality of the humerus [HP:0003063],4/4,100%,8/8,100%,,
+Abnormality of the upper arm [HP:0001454],4/4,100%,8/8,100%,,
+1-2 finger syndactyly [HP:0010704],1/1,100%,4/4,100%,,
+Finger syndactyly [HP:0006101],2/2,100%,5/5,100%,,
+Syndactyly [HP:0001159],2/2,100%,5/5,100%,,
+Complete atrioventricular canal defect [HP:0001674],3/36,8%,6/67,9%,,
+Atrioventricular canal defect [HP:0006695],3/3,100%,6/6,100%,,
+Upper limb phocomelia [HP:0009813],2/37,5%,8/116,7%,,
+Phocomelia [HP:0009829],2/2,100%,8/8,100%,,
+Abnormality of cardiovascular system electrophysiology [HP:0030956],3/3,100%,18/18,100%,,
+Atrioventricular dissociation [HP:0011709],1/1,100%,0/22,0%,,
+Abnormal morphology of ulna [HP:0040071],4/4,100%,4/4,100%,,
+Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,4/4,100%,,
+Mitral valve prolapse [HP:0001634],1/1,100%,1/1,100%,,
+Abnormal mitral valve morphology [HP:0001633],1/1,100%,1/1,100%,,
+Abnormal atrioventricular valve morphology [HP:0006705],1/1,100%,1/1,100%,,
+Abnormal heart valve morphology [HP:0001654],1/1,100%,1/1,100%,,
+Proximal placement of thumb [HP:0009623],0/0,0%,3/3,100%,,
+Deviation of the thumb [HP:0009603],2/2,100%,3/3,100%,,
+Deviation of finger [HP:0004097],2/2,100%,4/4,100%,,
+Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,5/5,100%,,
+Short 1st metacarpal [HP:0010034],0/22,0%,1/45,2%,,
+Short phalanx of the thumb [HP:0009660],0/22,0%,1/45,2%,,
+Sinus bradycardia [HP:0001688],1/1,100%,2/2,100%,,
+Abnormal electrophysiology of sinoatrial node origin [HP:0011702],1/1,100%,2/2,100%,,
+Arrhythmia [HP:0011675],1/1,100%,3/3,100%,,
+Bradycardia [HP:0001662],1/1,100%,2/2,100%,,
+Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,6/6,100%,,
+Shoulder muscle hypoplasia [HP:0008952],0/0,0%,6/6,100%,,
+Hypoplasia of the musculature [HP:0009004],0/0,0%,6/6,100%,,
+Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,6/6,100%,,
+Abnormality of muscle size [HP:0030236],0/0,0%,6/6,100%,,
+Abnormal skeletal muscle morphology [HP:0011805],0/0,0%,8/8,100%,,
+Abnormality of the musculature [HP:0003011],0/0,0%,8/8,100%,,
+Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,6/6,100%,,
+Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,6/6,100%,,
+Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,6/6,100%,,
+Abnormality of the musculature of the limbs [HP:0009127],0/0,0%,8/8,100%,,
+Abnormality of the musculature of the upper limbs [HP:0001446],0/0,0%,8/8,100%,,
+Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,6/6,100%,,
+Patent foramen ovale [HP:0001655],0/36,0%,4/69,6%,,
+Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,,
+Abnormality of joint mobility [HP:0011729],5/5,100%,3/3,100%,,
+Abnormal joint physiology [HP:0034430],5/5,100%,3/3,100%,,
+Abnormal musculoskeletal physiology [HP:0011843],5/5,100%,3/3,100%,,
+Abnormality of the vertebral column [HP:0000925],1/1,100%,4/4,100%,,
+Upper extremity joint dislocation [HP:0030310],2/2,100%,0/0,0%,,
+Joint dislocation [HP:0001373],2/2,100%,0/0,0%,,
+Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,,
+Abnormal thorax morphology [HP:0000765],5/5,100%,7/7,100%,,
+Limited elbow movement [HP:0002996],4/4,100%,2/2,100%,,
+Abnormality of the elbow [HP:0009811],5/5,100%,2/2,100%,,
+Limitation of joint mobility [HP:0001376],4/4,100%,2/2,100%,,
+Mitral regurgitation [HP:0001653],2/2,100%,2/2,100%,,
+Abnormal mitral valve physiology [HP:0031481],2/2,100%,2/2,100%,,
+Pre-capillary pulmonary hypertension [HP:0033578],0/0,0%,8/8,100%,,
+Elevated pulmonary artery pressure [HP:0004890],0/0,0%,8/8,100%,,
+Abnormality of pulmonary circulation [HP:0030875],0/0,0%,8/8,100%,,
+Abnormal vascular physiology [HP:0030163],0/0,0%,8/8,100%,,
+Abnormal respiratory system physiology [HP:0002795],0/0,0%,8/8,100%,,
+Abnormality of the respiratory system [HP:0002086],0/0,0%,8/8,100%,,
+Right atrial enlargement [HP:0030718],0/0,0%,4/4,100%,,
+Abnormal right atrium morphology [HP:0025580],0/0,0%,4/4,100%,,
+Atrial septal dilatation [HP:0011995],0/0,0%,4/4,100%,,
+Pectus excavatum [HP:0000767],2/2,100%,3/4,75%,,
+Abnormal sternum morphology [HP:0000766],2/2,100%,3/3,100%,,
+Postaxial hand polydactyly [HP:0001162],0/0,0%,3/4,75%,,
+Postaxial polydactyly [HP:0100259],0/0,0%,3/3,100%,,
+Polydactyly [HP:0010442],0/0,0%,3/3,100%,,
+Abnormal 5th finger morphology [HP:0004207],0/0,0%,6/6,100%,,
+Hand polydactyly [HP:0001161],0/0,0%,3/3,100%,,
+Duplication of phalanx of hand [HP:0009997],0/0,0%,3/3,100%,,
+Duplication of hand bones [HP:0004275],0/0,0%,3/3,100%,,
+Duplication of bones involving the upper extremities [HP:0009142],0/0,0%,3/3,100%,,
+High palate [HP:0000218],0/0,0%,3/3,100%,,
+Short neck [HP:0000470],0/0,0%,3/3,100%,,
+Abnormal neck morphology [HP:0025668],0/0,0%,3/3,100%,,
+Abnormality of the neck [HP:0000464],0/0,0%,3/3,100%,,
+Abnormality of the cervical spine [HP:0003319],0/0,0%,3/3,100%,,
+Shield chest [HP:0000914],0/0,0%,3/3,100%,,
+Enlarged thorax [HP:0100625],0/0,0%,3/3,100%,,
+Abnormal rib cage morphology [HP:0001547],0/0,0%,5/5,100%,,
+Y-shaped metatarsals [HP:0010567],0/0,0%,3/3,100%,,
+Abnormal metatarsal morphology [HP:0001832],0/0,0%,3/3,100%,,
+Abnormal lower limb bone morphology [HP:0040069],0/0,0%,4/4,100%,,
+Abnormality of the lower limb [HP:0002814],0/0,0%,4/4,100%,,
+Abnormal foot morphology [HP:0001760],0/0,0%,4/4,100%,,
+Common atrium [HP:0011565],0/38,0%,1/115,1%,,
+Unroofed coronary sinus [HP:0031297],0/38,0%,1/117,1%,,
+Limited pronation/supination of forearm [HP:0006394],3/3,100%,2/2,100%,,
+First degree atrioventricular block [HP:0011705],1/1,100%,1/23,4%,,
+Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,,
+Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],2/2,100%,2/2,100%,,
+Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,,
+Clinodactyly of the 5th finger [HP:0004209],0/0,0%,2/2,100%,,
+Finger clinodactyly [HP:0040019],0/0,0%,2/2,100%,,
+Clinodactyly [HP:0030084],0/0,0%,2/2,100%,,
+Deviation of the 5th finger [HP:0009179],0/0,0%,2/2,100%,,
+Abnormal 2nd finger morphology [HP:0004100],0/0,0%,2/2,100%,,
+Aplasia/Hypoplasia of the 2nd finger [HP:0006264],0/0,0%,2/2,100%,,
+Short middle phalanx of finger [HP:0005819],0/0,0%,2/2,100%,,
+Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,2/2,100%,,
+Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,2/2,100%,,
+Short phalanx of finger [HP:0009803],0/0,0%,2/2,100%,,
+Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,2/2,100%,,
+Type A brachydactyly [HP:0009370],0/0,0%,2/2,100%,,
+Brachydactyly [HP:0001156],0/0,0%,2/2,100%,,
+Short 5th finger [HP:0009237],0/0,0%,2/2,100%,,
+Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,2/2,100%,,
+Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,2/2,100%,,
+Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,2/2,100%,,
+Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,2/2,100%,,
+Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,2/2,100%,,
+11 pairs of ribs [HP:0000878],0/0,0%,2/2,100%,,
+Missing ribs [HP:0000921],0/0,0%,2/2,100%,,
+Aplasia/Hypoplasia of the ribs [HP:0006712],0/0,0%,2/2,100%,,
+Abnormal rib morphology [HP:0000772],0/0,0%,2/2,100%,,
+Bowed forearm bones [HP:0003956],1/1,100%,0/0,0%,,
+Bowing of the arm [HP:0006488],1/1,100%,0/0,0%,,
+Bowing of the long bones [HP:0006487],1/1,100%,0/0,0%,,
+Abnormal diaphysis morphology [HP:0000940],1/1,100%,0/0,0%,,
+Abnormal skin morphology [HP:0011121],1/1,100%,0/0,0%,,
+Abnormality of the skin [HP:0000951],1/1,100%,0/0,0%,,
+Abnormality of the integument [HP:0001574],1/1,100%,0/0,0%,,
+Sinus venosus atrial septal defect [HP:0011567],1/1,100%,0/2,0%,,
+Left ventricular noncompaction cardiomyopathy [HP:0011664],1/5,20%,1/7,14%,,
+Noncompaction cardiomyopathy [HP:0012817],1/1,100%,1/1,100%,,
+Cardiomyopathy [HP:0001638],1/1,100%,1/1,100%,,
+Abnormal myocardium morphology [HP:0001637],1/1,100%,1/1,100%,,
+Abnormal toe morphology [HP:0001780],0/0,0%,1/1,100%,,
+Abnormal toe phalanx morphology [HP:0010161],0/0,0%,1/1,100%,,
+Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,1/1,100%,,
+Amelia involving the upper limbs [HP:0009812],1/37,3%,0/114,0%,,
+Third degree atrioventricular block [HP:0001709],1/1,100%,0/22,0%,,
+Small hypothenar eminence [HP:0010487],0/0,0%,2/2,100%,,
+Abnormality of the hypothenar eminence [HP:0010486],0/0,0%,2/2,100%,,
+Abnormality of the musculature of the hand [HP:0001421],0/0,0%,2/2,100%,,
diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html
new file mode 100644
index 00000000..3d00300a
--- /dev/null
+++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html
@@ -0,0 +1,141 @@
+
+
+
+
+ Cohort
+
+
+
+
+ Phenotype testing report
+ Phenotype MTC filter: HPO MTC filter
+ Multiple testing correction: fdr_bh
+ Performed statistical tests for 17 out of the total of 260 HPO terms.
+
+ Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.
+
+
+ Code |
+ Reason |
+ Count |
+
+
+
+
+ TODO |
+ Skipping term with only 2 observations (not powered for 2x2) |
+ 51 |
+
+
+
+
+ TODO |
+ Skipping term because all genotypes have same HPO observed proportions |
+ 50 |
+
+
+
+
+ TODO |
+ Skipping general term |
+ 44 |
+
+
+
+
+ TODO |
+ Skipping term with only 3 observations (not powered for 2x2) |
+ 27 |
+
+
+
+
+ TODO |
+ Skipping term with only 6 observations (not powered for 2x2) |
+ 19 |
+
+
+
+
+ TODO |
+ Skipping term with only 4 observations (not powered for 2x2) |
+ 16 |
+
+
+
+
+ TODO |
+ Skipping term with maximum frequency that was less than threshold 0.2 |
+ 10 |
+
+
+
+
+ TODO |
+ Skipping term with only 5 observations (not powered for 2x2) |
+ 10 |
+
+
+
+
+ TODO |
+ Skipping term with only 1 observations (not powered for 2x2) |
+ 7 |
+
+
+
+
+ TODO |
+ Skipping term because one genotype had zero observations |
+ 5 |
+
+
+
+
+ TODO |
+ Skipping term because no genotype has more than one observed HPO count |
+ 4 |
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst
index 054763d5..993dd592 100644
--- a/docs/user-guide/stats.rst
+++ b/docs/user-guide/stats.rst
@@ -4,25 +4,32 @@
Statistical tests
=================
-There are many different ways of statistically testing for genotype-phenotype correlations,
-and the appropriate statistical test depends on the question.
-This document provides an overview of the tests offered by the GPSEA library
+There are many different ways of statistically testing for genotype-phenotype correlations,
+and the appropriate statistical test depends on the question.
+This document provides an overview of the tests offered by the GPSEA library
and explanations of how they are implemented by our software.
+*************************************
+Compare genotype and phenotype groups
+*************************************
+
+TODO
+
+.. _fisher-exact-test:
Fisher exact test (FET)
-~~~~~~~~~~~~~~~~~~~~~~~
+=======================
-The Fisher exact test (FET) calculates the exact probability value
-for the relationship between two dichotomous variables.
+The Fisher exact test (FET) calculates the exact probability value
+for the relationship between two dichotomous variables.
In our implementation, the two dichotomous variables are the genotype and the phenotype.
-For instance, the individuals of the cohort may be divided
-according to whether or not they have a nonsense variant
+For instance, the individuals of the cohort may be divided
+according to whether or not they have a nonsense variant
and according to whether or not they have Strabismus (`HP:0000486 `_).
-The results of FET are expressed in terms of an exact probability (P-value), varying within 0 and 1.
-Two groups are considered statistically significant if the P-value is less
+The results of FET are expressed in terms of an exact probability (P-value), varying within 0 and 1.
+Two groups are considered statistically significant if the P-value is less
than the chosen significance level (usually :math:`\alpha = 0.05`).
The following graphic shows an example contingency table that is used to conduct a Fisher exact test.
@@ -32,12 +39,12 @@ We are comparing the frequency of *strabismus* in individuals with missense and
:alt: Fisher exact text example
:align: center
:width: 600px
-
+
To perform the corresponding test in Python, we would use the following code.
>>> import scipy.stats as stats
>>> contingency_table = [
-... # Missense, Nonsense
+... # Missense, Nonsense
... [17, 6 ], # Strabismus observed
... [1, 19 ], # Strabismus excluded
... ]
@@ -49,27 +56,213 @@ To perform the corresponding test in Python, we would use the following code.
The ``p_value`` evaluates to `5.432292015291845e-06`, meaning there is a significant difference between the groups.
-The Fisher exact test evaluates whether the observed frequencies in a contingency table significantly
+The Fisher exact test evaluates whether the observed frequencies in a contingency table significantly
deviate from the frequencies we would expect if there were no association between the variables.
-We want to test whether the frequency of `HP:0000486`` is significantly higher or lower in
-one genotype group compared to what would be expected if there were no association.
-Note that by default, the *two-tailed* Fisher exact test is performed, meaning we have no
-hypothesis as to whether there is a higher or lower frequency in one of the genotype groups.
+We want to test whether the frequency of `HP:0000486`` is significantly higher or lower in
+one genotype group compared to what would be expected if there were no association.
+Note that by default, the *two-tailed* Fisher exact test is performed, meaning we have no
+hypothesis as to whether there is a higher or lower frequency in one of the genotype groups.
+
+However, we are typically interested in testing the associations between the genotype and multiple phenotypic features at once.
+GPSEA takes advantage of the HPO structure and simplifies the testing for all HPO terms encoded in the cohort.
+
+Example
+-------
+
+Let's illustrate this in a real-life example of the analysis of the association between frameshift variants in *TBX5* gene
+and congenital heart defects in the dataset of 156 individuals with mutations in *TBX5* whose signs and symptoms were
+encoded into HPO terms, stored as phenopackets of the `GA4GH Phenopacket Schema `_,
+and deposited in `Phenopacket Store `_ (version `0.1.18`).
+
+.. note::
+
+ The shorter version of the same analysis has been presented in the :ref:`tutorial`.
+
+
+Create cohort
+^^^^^^^^^^^^^
+
+We will load and transform the phenopackets into a :class:`~gpsea.model.Cohort`,
+as described in :ref:`input-data` section. Briefly, we will load the phenopackets:
+
+>>> from ppktstore.registry import configure_phenopacket_registry
+>>> registry = configure_phenopacket_registry()
+>>> with registry.open_phenopacket_store(release='0.1.18') as ps:
+... phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5'))
+>>> len(phenopackets)
+156
+
+followed by loading HPO release `v2024-07-01`:
+
+>>> import hpotk
+>>> store = hpotk.configure_ontology_store()
+>>> hpo = store.load_minimal_hpo(release='v2024-07-01')
+
+and we will perform Q/C and functional annotations for the mutations
+with the default cohort creator:
+
+>>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets
+>>> cohort_creator = configure_caching_cohort_creator(hpo)
+>>> cohort, qc_results = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+Patients Created: ...
+>>> qc_results.summarize() # doctest: +SKIP
+Validated under none policy
+No errors or warnings were found
+
+Configure analysis
+^^^^^^^^^^^^^^^^^^
+
+We want to test the association between frameshift *TBX5* variants and phenotypic abnormalities.
+GPSEA exposes a flexible predicate API that lets us create genotype and phenotype predicates
+to assign the cohort members into genotype and phenotype categories based on the variants
+and the HPO terms. We need to create one genotype predicate and one or more phenotype predicates.
+
+
+**Genotype predicate**
+
+We want to separate the patients into two groups: a group *with* a frameshift variant
+and a group *without* a frameshift variant, based on the functional annotation.
+We will use the *MANE* transcript for the analysis:
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate
+>>> tx_id = 'NM_181486.4'
+>>> gt_predicate = boolean_predicate(VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id))
+>>> gt_predicate.get_question()
+'FRAMESHIFT_VARIANT on NM_181486.4'
+
+
+**Phenotype predicates**
+
+We recommend testing the genotype phenotype association for all HPO terms that are present in 2 or more cohort members,
+while taking advantage of the HPO graph structure and of the :ref:`true-path-rule`.
+We will use the :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest`
+utility function to generate phenotype predicates for all HPO terms:
+
+>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
+>>> pheno_predicates = prepare_predicates_for_terms_of_interest(
+... cohort=cohort,
+... hpo=hpo,
+... min_n_of_patients_with_term=2,
+... )
+>>> len(pheno_predicates)
+260
+
+The function finds all HPO terms that annotate at least *n* (``min_n_of_patients_with_term=2`` above) individuals,
+including the *indirect* annotations whose presence is implied by the true path rule.
+
+
+**Statistical test**
+
+We will use :ref: to test the association
+between genotype and phenotype groups, as described previously.
+
+>>> from gpsea.analysis.pcats.stats import ScipyFisherExact
+>>> count_statistic = ScipyFisherExact()
+
+FET will compute a p value for each genotype phenotype group.
+
+
+**Multiple testing correction**
+
+In the case of this cohort, we could test association between having a frameshift variant and one of 260 HPO terms.
+However, testing multiple hypotheses on the same dataset increases the risk of finding a significant association
+by chance.
+GPSEA uses a two-pronged strategy to mitigate this risk - use Phenotype MTC filter and multiple testing correction.
+
+.. note::
+
+ See the :ref:`mtc` section for more info on multiple testing procedures.
+
+Here we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`)
+with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``)
+with a false discovery control level set to `0.05` (``mtc_alpha=0.05``):
+
+>>> from gpsea.analysis.mtc_filter import HpoMtcFilter
+>>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=0.2)
+>>> mtc_correction = 'fdr_bh'
+>>> mtc_alpha = 0.05
+
+
+**Final analysis**
+
+We finalize the analysis setup by putting all components together
+into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
+
+>>> from gpsea.analysis.pcats import HpoTermAnalysis
+>>> analysis = HpoTermAnalysis(
+... count_statistic=count_statistic,
+... mtc_filter=mtc_filter,
+... mtc_correction=mtc_correction,
+... mtc_alpha=mtc_alpha,
+... )
+
+
+Analysis
+^^^^^^^^
+
+We can now execute the analysis:
+
+>>> result = analysis.compare_genotype_vs_phenotypes(
+... cohort=cohort,
+... gt_predicate=gt_predicate,
+... pheno_predicates=pheno_predicates,
+... )
+>>> len(result.phenotypes)
+260
+>>> result.total_tests
+17
+
+
+Thanks to Phenotype MTC filter, we only tested 16 out of 260 terms.
+We can learn more by showing the MTC filter report:
+
+>>> from gpsea.view import MtcStatsViewer
+>>> mtc_viewer = MtcStatsViewer()
+>>> mtc_report = mtc_viewer.process(result)
+>>> with open('docs/user-guide/report/tbx5_frameshift.mtc_report.html', 'w') as fh: # doctest: +SKIP
+... _ = fh.write(mtc_report)
+
+
+.. raw:: html
+ :file: report/tbx5_frameshift.mtc_report.html
+
+
+Genotype phenotype associations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Last, let's explore the associations. This is a table of the tested HPO terms
+ordered by the corrected p value (Benjamini-Hochberg FDR):
+
+>>> from gpsea.analysis.predicate import PatientCategories
+>>> summary_df = result.summarize(hpo, PatientCategories.YES)
+>>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP
+
+.. csv-table:: *TBX5* frameshift vs rest
+ :file: report/tbx5_frameshift.csv
+ :header-rows: 2
.. _phenotype-score-stats:
-Mann-Whitney U Test
-~~~~~~~~~~~~~~~~~~~
+***************
+Phenotype score
+***************
+
+
+.. _mann-whitney-u-test:
+
+Mann-Whitney U Test
+===================
We may want to compare the total number of occurences of a specific set of phenotypic features between two different genotype groups.
-For instance, `Jordan et al (2018) `_ found that the total number of structural defects
+For instance, `Jordan et al (2018) `_ found that the total number of structural defects
of the brain, eye, heart, and kidney and sensorineural hearing loss seen in individuals with point mutations in the Atrophin-1 domain of the RERE gene
is significantly higher than expected based on the number of similar defects seen in individuals with putative loss-of-function variants.
-Since there are five potential defects, each individual has a count ranging between 0 and 5.
+Since there are five potential defects, each individual has a count ranging between 0 and 5.
We perform a Mann-Whitney U Test (or Wilcoxon Rank-Sum Test) to compare the distribution of such counts between genotype groups.
-This is a non-parametric test that compares the medians of the two groups to determine if they come from the same distribution.
+This is a non-parametric test that compares the medians of the two groups to determine if they come from the same distribution.
>>> import scipy.stats as stats
>>> group1 = [0, 0, 1, 0, 2, 0, 1, 1, 1, 0, 2, 0, 0, 3, 1, 1, 1, 0]
@@ -79,14 +272,20 @@ This is a non-parametric test that compares the medians of the two groups to det
>>> float(p_value)
6.348081479150902e-06
+
``p_value`` evaluates to `6.348081479150901e-06`, meaning there is a significant difference between the groups.
Example
-^^^^^^^
+-------
+
+Let's now analyze the subjects reported in *Jordan et al*.
+We will load 19 phenopackets that represent individuals with mutations in *RERE*
+whose signs and symptoms were encoded into HPO terms and deposited into Phenopacket Store.
+The phenopackets will be processed into :class:`~gpsea.model.Cohort`
+as described in the :ref:`input-data` section.
-Let's now analyze the subjects reported in *Jordan et al*.
-We start by loading the cohort from Phenopacket Store (version `0.1.18`):
+Briefly, we will first load 19 phenopackets
>>> from ppktstore.registry import configure_phenopacket_registry
>>> registry = configure_phenopacket_registry()
@@ -95,37 +294,55 @@ We start by loading the cohort from Phenopacket Store (version `0.1.18`):
>>> len(phenopackets)
19
-We loaded 19 phenopackets.
-Now, we need to prepare the phenopackets for using with GPSEA.
-We will need HPO (version `v2024-07-01`)
+and load HPO (version `v2024-07-01`)
>>> import hpotk
>>> store = hpotk.configure_ontology_store()
>>> hpo = store.load_minimal_hpo(release='v2024-07-01')
-to create cohort creator
+
+to create a :class:`~gpsea.preprocessing.CohortCreator`
>>> from gpsea.preprocessing import configure_caching_cohort_creator
>>> cohort_creator = configure_caching_cohort_creator(hpo)
+
which we will use to preprocess the cohort
>>> from gpsea.preprocessing import load_phenopackets
>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Patients Created: ...
+
+
+resulting in a cohort consisting of 19 individuals
+
>>> len(cohort)
19
-Now we can set up the phenotype and genotype predicates. Jordan et al tests ...
-.. todo: improve the text
+Configure analysis
+^^^^^^^^^^^^^^^^^^
+
+Now we can set up the analysis of genotype and phenotype.
+We will perform the analysis using the *RERE* transcript selected
+as the "main" biologically relevant by the `MANE` consortium.
->>> rere_mane_tx_id = 'NM_001042681.2'
+>>> tx_id = 'NM_001042681.2'
-Now let's create a predicate for testing if the variant is a point mutation or a loss of function mutation.
-The point mutation predicate is defined as ...
-TODO: improve!
+
+**Genotype predicate**
+
+*Jordan et al.* compare phenotype of individuals harboring point mutations
+with the individuals carrying loss of function mutations.
+Let's create a predicate for testing if the variant
+is a point mutation or a loss of function mutation.
+
+In this example, the point mutation is a mutation that meets the following conditions:
+
+* predicted to lead to a missense variant on the `MANE` transcript
+* the :ref:`length-of-the-reference-allele` is equal to `1`
+* the :ref:`change-length-of-an-allele` is equal to `0`
>>> from gpsea.model import VariantEffect
>>> from gpsea.analysis.predicate.genotype import VariantPredicates
@@ -134,11 +351,12 @@ TODO: improve!
... )
>>> point_mutation = VariantPredicates.change_length('==', 0) \
... & VariantPredicates.ref_length('==', 1) \
-... & VariantPredicates.any(VariantPredicates.variant_effect(effect, rere_mane_tx_id) for effect in point_mutation_effects)
+... & VariantPredicates.any(VariantPredicates.variant_effect(effect, tx_id) for effect in point_mutation_effects)
>>> point_mutation.get_question()
'((change length == 0 AND ref allele length == 1) AND MISSENSE_VARIANT on NM_001042681.2)'
-For the loss of function predicate, these variant effects are considered loss of function:
+
+For the loss of function predicate, the following variant effects are considered loss of function:
>>> lof_effects = (
... VariantEffect.TRANSCRIPT_ABLATION,
@@ -146,10 +364,11 @@ For the loss of function predicate, these variant effects are considered loss of
... VariantEffect.START_LOST,
... VariantEffect.STOP_GAINED,
... )
->>> lof_mutation = VariantPredicates.any(VariantPredicates.variant_effect(eff, rere_mane_tx_id) for eff in lof_effects)
+>>> lof_mutation = VariantPredicates.any(VariantPredicates.variant_effect(eff, tx_id) for eff in lof_effects)
>>> lof_mutation.get_question()
'(TRANSCRIPT_ABLATION on NM_001042681.2 OR FRAMESHIFT_VARIANT on NM_001042681.2 OR START_LOST on NM_001042681.2 OR STOP_GAINED on NM_001042681.2)'
+
The genotype predicate will bin the patient into two groups: a point mutation group or the loss of function group:
>>> from gpsea.analysis.predicate.genotype import groups_predicate
@@ -160,8 +379,24 @@ The genotype predicate will bin the patient into two groups: a point mutation gr
>>> gt_predicate.get_question()
'Genotype group: Point, LoF'
-Now phenotype predicate. The authors divide the patients into groups according to the count of structural defects
-in these groups:
+
+**Phenotype score**
+
+The authors score the individuals based on the number of structural defects
+from the following 5 categories:
+
+* Brain anomalies
+* Eye anomalies
+* Congenital heart defects
+* Renal anomalies
+* Sensorineural hearing loss
+
+and they assign each individual a score based on the number structural defects.
+For example, an individual with a congenital heart defect would be assigned a score of `1`,
+an individual with congenital heart defect and a renal anomaly would be assigned a score of `2`,
+and so on.
+
+We automatize this scoring method by encoding the categories into HPO terms
>>> structural_defects = (
... 'HP:0012443', # Abnormal brain morphology (Brain anomalies)
@@ -171,58 +406,109 @@ in these groups:
... 'HP:0000407', # Sensorineural hearing impairment (Sensorineural hearing loss)
... )
-Let's run the analysis.
->>> from gpsea.analysis import configure_cohort_analysis
->>> analysis = configure_cohort_analysis(
-... cohort, hpo,
+and then test the individuals for presence of at least one HPO term
+that corresponds to the structural defect category
+(e.g. `Abnormal brain morphology `_)
+or that is its descendant
+(e.g. `Cerebellar atrophy `_).
+
+GPSEA implements this scoring method in :class:`~gpsea.analysis.pscore.CountingPhenotypeScorer`.
+
+>>> from gpsea.analysis.pscore import CountingPhenotypeScorer
+>>> pheno_scorer = CountingPhenotypeScorer.from_query_curies(
+... hpo=hpo,
+... query=structural_defects,
... )
->>> result = analysis.compare_genotype_vs_phenotype_group_count(
-... gt_predicate=gt_predicate,
-... phenotype_group_terms=structural_defects,
+
+
+**Statistical test**
+
+We will use :ref:`mann-whitney-u-test` as described above.
+
+>>> from gpsea.analysis.pscore.stats import MannWhitneyStatistic
+>>> score_statistic = MannWhitneyStatistic()
+
+
+**Final analysis**
+
+We will put the final analysis together into :class:`~gpsea.analysis.pscore.PhenotypeScoreAnalysis`.
+
+>>> from gpsea.analysis.pscore import PhenotypeScoreAnalysis
+>>> score_analysis = PhenotypeScoreAnalysis(
+... score_statistic=score_statistic,
... )
->>> round(result.p_value, 9)
-0.027066902
-We have the counts:
+Analysis
+^^^^^^^^
+
+We execute the analysis by running
->>> counts = result.genotype_phenotype_scores
->>> counts.head() # doctest: +NORMALIZE_WHITESPACE
+>>> result = score_analysis.compare_genotype_vs_phenotype_score(
+... cohort=cohort,
+... gt_predicate=gt_predicate,
+... pheno_scorer=pheno_scorer,
+... )
+
+
+The analysis shows a significant difference between the number of structural defects
+in individuals with point vs. loss-of-function mutations.
+
+>>> result.pval
+0.012074957610483744
+
+
+To explore further, we can access a data frame with genotype categories and phenotype counts:
+
+>>> scores = result.genotype_phenotype_scores.sort_index()
+>>> scores.head() # doctest: +NORMALIZE_WHITESPACE
genotype phenotype
-patient_id
+patient_id
Subject 10[PMID_27087320_Subject_10] 1 0
Subject 1[PMID_27087320_Subject_1] 0 4
+Subject 1[PMID_29330883_Subject_1] 1 0
Subject 2[PMID_27087320_Subject_2] None 4
Subject 2[PMID_29330883_Subject_2] 1 1
-Subject 3[PMID_27087320_Subject_3] 0 4
-The data frame provides a genotype category and a phenotype score for each patient.
+
+The data frame provides a `genotype` category and a `phenotype` score for each patient.
The genotype category should be interpreted in the context of the genotype predicate:
>>> gt_id_to_name = {c.category.cat_id: c.category.name for c in gt_predicate.get_categorizations()}
>>> gt_id_to_name
{0: 'Point', 1: 'LoF'}
-Let's plot the data:
+
+The genotype code `0` is assigned to patients with a point mutation, `1` corresponds to the loss-of-function mutations,
+and `None` is assigned to patients who cannot be assigned into any of the groups.
+
+Last, let's use :meth:`~gpsea.analysis.pscore.PhenotypeScoreAnalysisResult.plot_boxplots` method
+to show a box plot of the phenotype score distributions:
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
->>> data = counts.loc[counts['genotype'].notna()] # skip the patients with unassigned genotype group
->>> x = [data.loc[data['genotype'] == c.category.cat_id, 'phenotype'].to_list() for c in gt_predicate.get_categorizations()]
->>> gt_cat_labels = [gt_id_to_name[c.category.cat_id] for c in gt_predicate.get_categorizations()]
->>> bplot = ax.boxplot(
-... x=x,
-... patch_artist=True, tick_labels=gt_cat_labels,
+>>> result.plot_boxplots(
+... gt_predicate=gt_predicate,
+... ax=ax,
+... )
+>>> _ = ax.grid(axis="y")
+>>> _ = ax.set(
+... ylabel="Phenotype score", ylim=(-0.5, len(structural_defects) + 0.5)
... )
->>> _ = ax.grid(axis='y')
->>> _ = ax.set(ylabel='Phenotype group count', ylim=(-.5, len(structural_defects) + .5))
->>> for patch, color in zip(bplot['boxes'], ['darksalmon', 'honeydew']):
-... patch.set_facecolor(color)
->>> fig.savefig('docs/img/phenotype_group_counts.png') # doctest: +SKIP
+>>> fig.savefig('docs/img/rere_phenotype_score_boxplot.png') # doctest: +SKIP
-.. image:: /img/phenotype_group_counts.png
- :alt: Phenotype group counts
+.. image:: /img/rere_phenotype_score_boxplot.png
+ :alt: Phenotype score distribution
:align: center
:width: 600px
+
+
+We see that the individuals with the point mutations feature structural defects
+than the individuals with the loss-of-function mutations.
+
+The box extends from the first quartile (Q1) to the third quartile (Q3) of the data,
+with a red line at the median.
+The whiskers extend from the box to the farthest data point
+lying within 1.5x the inter-quartile range (IQR) from the box.
diff --git a/src/gpsea/analysis/__init__.py b/src/gpsea/analysis/__init__.py
index 0b4f157b..1adf3c2e 100644
--- a/src/gpsea/analysis/__init__.py
+++ b/src/gpsea/analysis/__init__.py
@@ -1,9 +1,7 @@
-from . import predicate
-
from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult, HpoMtcReport
+# TODO This should go away
from ._config import CohortAnalysisConfiguration, configure_cohort_analysis, configure_default_protein_metadata_service, MtcStrategy
from ._gp_analysis import apply_predicates_on_patients
-from ._mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter
from ._util import prepare_hpo_terms_of_interest
__all__ = [
@@ -11,7 +9,6 @@
'CohortAnalysis', 'GenotypePhenotypeAnalysisResult',
'CohortAnalysisConfiguration', 'MtcStrategy',
'HpoMtcReport',
- 'PhenotypeMtcFilter', 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter',
'apply_predicates_on_patients',
'configure_default_protein_metadata_service',
'prepare_hpo_terms_of_interest',
diff --git a/src/gpsea/analysis/_api.py b/src/gpsea/analysis/_api.py
index 5555e225..ca7bd8e4 100644
--- a/src/gpsea/analysis/_api.py
+++ b/src/gpsea/analysis/_api.py
@@ -7,9 +7,10 @@
from gpsea.model import Patient
from gpsea.preprocessing import ProteinMetadataService
-from .predicate import PolyPredicate, GenotypePolyPredicate, PatientCategory
-from .predicate.genotype import VariantPredicate, ProteinPredicates
-from .predicate.phenotype import P, PhenotypePolyPredicate, CountingPhenotypeScorer
+from .predicate import PolyPredicate, PatientCategory
+from .predicate.genotype import GenotypePolyPredicate, VariantPredicate, ProteinPredicates
+from .predicate.phenotype import P, PhenotypePolyPredicate
+from .pscore import PhenotypeScorer, CountingPhenotypeScorer
PatientsByHPO = namedtuple('PatientsByHPO', field_names=['all_with_hpo', 'all_without_hpo'])
@@ -18,6 +19,7 @@ class HpoMtcReport:
"""
Class to simplify reporting results of multiple testing filtering by HpoMtcFilter subclasses.
"""
+ # TODO: delete with no replacement.
def __init__(
self,
@@ -74,6 +76,7 @@ class GenotypePhenotypeAnalysisResult:
"""
`GenotypePhenotypeAnalysisResult` summarizes results of genotype-phenotype correlation analysis of a cohort.
"""
+ # TODO: delete and use `gpsea.analysis.pcats.MultiPhenotypeAnalysisResult`.
def __init__(
self,
@@ -142,7 +145,7 @@ def phenotype_categories(self) -> typing.Sequence[PatientCategory]:
Get a sequence of phenotype patient categories that can be investigated.
"""
return self._phenotype_categories
-
+
@property
def total_tests(self) -> int:
"""
@@ -239,6 +242,7 @@ class PhenotypeScoreAnalysisResult:
See :ref:`Mann Whitney U Test for phenotype score ` for more background.
"""
+ # TODO: delete and use `gpsea.analysis.pscore.PhenotypeScoreAnalysisResult`
def __init__(
self,
@@ -292,6 +296,7 @@ class CohortAnalysis(metaclass=abc.ABCMeta):
The class provides various methods to test genotype-phenotype correlations. All methods wrap results
into :class:`GenotypePhenotypeAnalysisResult`.
"""
+ # TODO: remove and use the analyses described in `User Guide > Statistical tests`.
def __init__(
self,
@@ -308,7 +313,7 @@ def compare_hpo_vs_genotype(
predicate: VariantPredicate,
) -> GenotypePhenotypeAnalysisResult:
"""
- Bin patients according to a presence of at least one allele that matches `predicate`
+ Bin patients according to a presence of at least one allele that matches `predicate`
and test for genotype-phenotype correlations.
"""
pass
@@ -352,6 +357,7 @@ def compare_genotype_vs_phenotype_group_count(
gt_predicate: GenotypePolyPredicate,
phenotype_group_terms: typing.Iterable[typing.Union[str, hpotk.TermId]],
) -> PhenotypeScoreAnalysisResult:
+ # TODO: separate into pscore module
assert isinstance(gt_predicate, GenotypePolyPredicate)
assert gt_predicate.n_categorizations() == 2
@@ -369,7 +375,7 @@ def compare_genotype_vs_phenotype_group_count(
def compare_genotype_vs_phenotype_score(
self,
gt_predicate: GenotypePolyPredicate,
- phenotype_scorer: typing.Callable[[Patient,], float],
+ phenotype_scorer: PhenotypeScorer,
) -> PhenotypeScoreAnalysisResult:
"""
Score the patients with a phenotype scoring method and test for correlation between the genotype group
@@ -386,8 +392,6 @@ def compare_genotype_vs_cohort_phenotypes(
self,
gt_predicate: GenotypePolyPredicate,
) -> GenotypePhenotypeAnalysisResult:
- # TODO: if we had access to the cohort, it would be trivial to prepare
- # all phenotype predicates and to call `self.compare_genotype_vs_phenotypes`.
pass
@abc.abstractmethod
diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py
index cbbed148..3fb67fc7 100644
--- a/src/gpsea/analysis/_config.py
+++ b/src/gpsea/analysis/_config.py
@@ -8,8 +8,8 @@
from gpsea.model import Cohort
from gpsea.preprocessing import ProteinMetadataService, UniprotProteinMetadataService, ProteinAnnotationCache, \
ProtCachingMetadataService
+from .mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter
from ._api import CohortAnalysis
-from ._mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter
from ._gp_analysis import FisherExactAnalyzer
from ._gp_impl import GpCohortAnalysis
diff --git a/src/gpsea/analysis/_gp_analysis.py b/src/gpsea/analysis/_gp_analysis.py
index 63a43f18..35d14305 100644
--- a/src/gpsea/analysis/_gp_analysis.py
+++ b/src/gpsea/analysis/_gp_analysis.py
@@ -6,11 +6,12 @@
from statsmodels.stats import multitest
from gpsea.model import Patient
+from .mtc_filter import PhenotypeMtcFilter
+from .predicate import PatientCategory
+from .predicate.genotype import GenotypePolyPredicate
+from .predicate.phenotype import PhenotypePolyPredicate, P
from ._api import GenotypePhenotypeAnalysisResult, HpoMtcReport
-from ._mtc_filter import PhenotypeMtcFilter
from ._stats import run_fisher_exact, run_recessive_fisher_exact
-from .predicate import GenotypePolyPredicate, PatientCategory
-from .predicate.phenotype import PhenotypePolyPredicate, P
def apply_predicates_on_patients(
@@ -44,33 +45,31 @@ def apply_predicates_on_patients(
- a mapping from phenotype :class:`P` to a data frame with counts of patients
in i-th phenotype category and j-th genotype category where i and j are rows and columns of the data frame
"""
+ # TODO: delete with no replacement.
phenotypes = set()
categories = set()
for predicate in pheno_predicates:
categories.update(predicate.get_categories())
- phenotypes.update(predicate.phenotypes)
+ phenotypes.add(predicate.phenotype)
n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes))
# Apply genotype and phenotype predicates
counts = {}
for ph_predicate in pheno_predicates:
- phenotypes = ph_predicate.phenotypes
-
- for phenotype in phenotypes:
- if phenotype not in counts:
- # Make an empty frame for keeping track of the counts.
- counts[phenotype] = pd.DataFrame(
- data=0,
- index=pd.Index(
- data=ph_predicate.get_categories(),
- name=ph_predicate.get_question(),
- ),
- columns=pd.Index(
- data=gt_predicate.get_categories(),
- name=gt_predicate.get_question(),
- ),
- )
+ if ph_predicate.phenotype not in counts:
+ # Make an empty frame for keeping track of the counts.
+ counts[ph_predicate.phenotype] = pd.DataFrame(
+ data=0,
+ index=pd.Index(
+ data=ph_predicate.get_categories(),
+ name=ph_predicate.get_question(),
+ ),
+ columns=pd.Index(
+ data=gt_predicate.get_categories(),
+ name=gt_predicate.get_question(),
+ ),
+ )
for patient in patients:
pheno_cat = ph_predicate.test(patient)
@@ -87,6 +86,7 @@ class GPAnalyzer(typing.Generic[P], metaclass=abc.ABCMeta):
"""
`GPAnalyzer` calculates p values for genotype-phenotype correlation of phenotypic features of interest.
"""
+ # TODO: delete with no replacement.
@abc.abstractmethod
def analyze(
@@ -115,6 +115,7 @@ class FisherExactAnalyzer(typing.Generic[P], GPAnalyzer[P]):
should be applied.
:param mtc_alpha: a `float` in range :math:`(0, 1]` with the multiple testing correction alpha value.
"""
+ # TODO: delete and use `gpsea.analysis.pcats.MultiPhenotypeAnalysis`.
def __init__(
self,
diff --git a/src/gpsea/analysis/_gp_impl.py b/src/gpsea/analysis/_gp_impl.py
index a35c6b96..c6251f8b 100644
--- a/src/gpsea/analysis/_gp_impl.py
+++ b/src/gpsea/analysis/_gp_impl.py
@@ -6,10 +6,10 @@
from scipy.stats import mannwhitneyu
-from gpsea.model import Cohort, Patient
+from gpsea.model import Cohort
from gpsea.preprocessing import ProteinMetadataService
-from .predicate import GenotypePolyPredicate
-from .predicate.genotype import VariantPredicate
+from .pscore import PhenotypeScorer
+from .predicate.genotype import GenotypePolyPredicate, VariantPredicate
from .predicate.genotype import boolean_predicate as wrap_as_boolean_predicate
from .predicate.genotype import groups_predicate as wrap_as_groups_predicate
from .predicate.genotype import recessive_predicate as wrap_as_recessive_predicate
@@ -131,7 +131,7 @@ def _prepare_disease_predicates(
def compare_genotype_vs_phenotype_score(
self,
gt_predicate: GenotypePolyPredicate,
- phenotype_scorer: typing.Callable[[Patient,], float],
+ phenotype_scorer: PhenotypeScorer,
) -> PhenotypeScoreAnalysisResult:
idx = pd.Index(tuple(p.patient_id for p in self._patient_list), name='patient_id')
data = pd.DataFrame(
@@ -144,7 +144,7 @@ def compare_genotype_vs_phenotype_score(
for patient in self._patient_list:
gt_cat = gt_predicate.test(patient)
data.loc[patient.patient_id, 'genotype'] = None if gt_cat is None else gt_cat.category.cat_id
- data.loc[patient.patient_id, 'phenotype'] = phenotype_scorer(patient)
+ data.loc[patient.patient_id, 'phenotype'] = phenotype_scorer.score(patient)
# To improve the determinism
data.sort_index(inplace=True)
diff --git a/src/gpsea/analysis/_inheritance.py b/src/gpsea/analysis/_inheritance.py
deleted file mode 100644
index 5c0c4236..00000000
--- a/src/gpsea/analysis/_inheritance.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import abc
-import typing
-
-from gpsea.model import Variant
-
-
-class InheritanceCompatibilityChecker(metaclass=abc.ABCMeta):
-
- @abc.abstractmethod
- def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool:
- pass
-
-
-class AutosomalDominantChecker(InheritanceCompatibilityChecker):
-
- def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool:
- pass
-
-
-class AutosomalRecessiveChecker(InheritanceCompatibilityChecker):
-
- def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool:
- pass
-
-
-class MitochondrialChecker(InheritanceCompatibilityChecker):
-
- def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool:
- pass
diff --git a/src/gpsea/analysis/_stats.py b/src/gpsea/analysis/_stats.py
index f2a480b9..1364e7d5 100644
--- a/src/gpsea/analysis/_stats.py
+++ b/src/gpsea/analysis/_stats.py
@@ -6,6 +6,8 @@
import numpy as np
import scipy
+# TODO: delete the module
+
class MultiFisherExact(metaclass=abc.ABCMeta):
diff --git a/src/gpsea/analysis/_test_fisherExact.py b/src/gpsea/analysis/_test_fisherExact.py
index aba6ff26..fd530f5c 100644
--- a/src/gpsea/analysis/_test_fisherExact.py
+++ b/src/gpsea/analysis/_test_fisherExact.py
@@ -8,6 +8,8 @@
def MultiExact() -> PythonMultiFisherExact:
return PythonMultiFisherExact()
+# TODO: remove
+
@pytest.mark.parametrize('table, raise_error, pVal',
([[[0,0,0],[0,0,0]], pytest.raises(ValueError), None],
[[[2, 1, 0],[3, 0, 2]], does_not_raise(), 0.6429],
diff --git a/src/gpsea/analysis/_util.py b/src/gpsea/analysis/_util.py
index 1e0bdc77..2c3445b5 100644
--- a/src/gpsea/analysis/_util.py
+++ b/src/gpsea/analysis/_util.py
@@ -24,6 +24,7 @@ def prepare_hpo_terms_of_interest(
:param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term
(either directly or indirectly) for the term to be included in the analysis.
"""
+ # TODO remove in favor of `gpsea.analysis.predicate.phenotype`
present_count = Counter()
excluded_count = Counter()
diff --git a/src/gpsea/analysis/mtc_filter/__init__.py b/src/gpsea/analysis/mtc_filter/__init__.py
new file mode 100644
index 00000000..4497ac34
--- /dev/null
+++ b/src/gpsea/analysis/mtc_filter/__init__.py
@@ -0,0 +1,14 @@
+"""
+The `gpsea.analysis.mtc_filter` provides the strategies for reducing multiple testing burden
+by pre-filtering the phenotype terms to test in advance.
+
+See :ref:`MTC filters ` section for more info.
+"""
+
+from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult
+from ._impl import UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter
+
+__all__ = [
+ 'PhenotypeMtcFilter', 'PhenotypeMtcResult',
+ 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter',
+]
diff --git a/src/gpsea/analysis/_mtc_filter.py b/src/gpsea/analysis/mtc_filter/_impl.py
similarity index 68%
rename from src/gpsea/analysis/_mtc_filter.py
rename to src/gpsea/analysis/mtc_filter/_impl.py
index 2025c0ed..b15e4bc6 100644
--- a/src/gpsea/analysis/_mtc_filter.py
+++ b/src/gpsea/analysis/mtc_filter/_impl.py
@@ -6,10 +6,62 @@
import hpotk
import pandas as pd
-from .predicate import PatientCategories, PatientCategory, GenotypePolyPredicate
+from ..predicate import PatientCategories, PatientCategory
+from ..predicate.genotype import GenotypePolyPredicate
+from ..predicate.phenotype import P
-class PhenotypeMtcFilter(metaclass=abc.ABCMeta):
+class PhenotypeMtcResult:
+ """
+ `PhenotypeMtcResult` represents a result of :class:`PhenotypeMtcFilter` for a single phenotype.
+
+ The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`)
+ of be filtered out (:meth:`is_filtered_out`) in which case :attr:`reason` must be available.
+ """
+
+ @staticmethod
+ def ok() -> "PhenotypeMtcResult":
+ # A singleton would be nice here...
+ return PhenotypeMtcResult(status=True, reason=None)
+
+ @staticmethod
+ def fail(reason: str) -> "PhenotypeMtcResult":
+ return PhenotypeMtcResult(status=False, reason=reason)
+
+ def __init__(
+ self,
+ status: bool,
+ reason: typing.Optional[str],
+ ):
+ self._status = status
+ self._reason = reason
+
+ def is_passed(self) -> bool:
+ return self._status
+
+ def is_filtered_out(self) -> bool:
+ return not self._status
+
+ @property
+ def reason(self) -> typing.Optional[str]:
+ return self._reason
+
+ def __eq__(self, value: object) -> bool:
+ return isinstance(value, PhenotypeMtcResult) \
+ and self._status == value._status \
+ and self._reason == value._reason
+
+ def __hash__(self) -> int:
+ return hash((self._status, self._reason))
+
+ def __str__(self) -> str:
+ return f'PhenotypeMtcResult(status={self._status}, reason={self._reason})'
+
+ def __repr__(self) -> str:
+ return str(self)
+
+
+class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta):
"""
`PhenotypeMtcFilter` decides which phenotypes should be tested and which phenotypes
are not worth testing in order to reduce the multiple testing burden.
@@ -20,6 +72,24 @@ class PhenotypeMtcFilter(metaclass=abc.ABCMeta):
categorized according to the HPO term.
"""
+ @abc.abstractmethod
+ def filter(
+ self,
+ gt_predicate: GenotypePolyPredicate,
+ phenotypes: typing.Sequence[P],
+ counts: typing.Sequence[pd.DataFrame],
+ ) -> typing.Sequence[PhenotypeMtcResult]:
+ """
+ Test if the phenotype with given counts should be included in the downstream analysis.
+
+ :param gt_predicate: the predicate that produced the columns of the `count` data frame.
+ :param phenotypes: the tested phenotypes.
+ :param counts: a sequence of 2D data frames for the tested phenotypes.
+ Each data frame corrresponds to a genotype/phenotype contingency matrix.
+ :returns: a sequence of filter results for the input `phenotypes`.
+ """
+ pass
+
@abc.abstractmethod
def filter_terms_to_test(
self,
@@ -58,13 +128,25 @@ def filter_method_name(self) -> str:
pass
-class UseAllTermsMtcFilter(PhenotypeMtcFilter):
+class UseAllTermsMtcFilter(PhenotypeMtcFilter[typing.Any]):
"""
`UseAllTermsMtcFilter` filters out *no* phenotype terms.
See :ref:`use-all-terms-strategy` section for more info.
"""
+ def __init__(self):
+ self._ok = PhenotypeMtcResult.ok()
+
+ def filter(
+ self,
+ gt_predicate: GenotypePolyPredicate,
+ phenotypes: typing.Sequence[P],
+ counts: typing.Sequence[pd.DataFrame],
+ ) -> typing.Sequence[PhenotypeMtcResult]:
+ # Always OK! 😏
+ return tuple(self._ok for _ in phenotypes)
+
def filter_terms_to_test(
self,
gt_predicate: GenotypePolyPredicate,
@@ -85,7 +167,7 @@ def filter_method_name(self) -> str:
return "All HPO terms"
-class SpecifiedTermsMtcFilter(PhenotypeMtcFilter):
+class SpecifiedTermsMtcFilter(PhenotypeMtcFilter[hpotk.TermId]):
"""
`SpecifiedTermsMtcFilter` limits the HPO terms to be tested to a selection of provided terms.
@@ -108,9 +190,25 @@ def __init__(
hpo: reference to HPO ontology object
terms_to_test: an iterable of TermIds representing the terms to test
"""
+ self._ok = PhenotypeMtcResult.ok()
+ self._fail = PhenotypeMtcResult.fail("Non-specified term")
self._hpo = hpo
self._terms_to_test_set = set(terms_to_test)
+ def filter(
+ self,
+ gt_predicate: GenotypePolyPredicate,
+ phenotypes: typing.Sequence[P],
+ counts: typing.Sequence[pd.DataFrame],
+ ) -> typing.Sequence[PhenotypeMtcResult]:
+ results = []
+ for phenotype in phenotypes:
+ if phenotype in self._terms_to_test_set:
+ results.append(self._ok)
+ else:
+ results.append(self._fail)
+ return tuple(results)
+
def filter_terms_to_test(
self,
gt_predicate: GenotypePolyPredicate,
@@ -153,12 +251,14 @@ def filter_method_name(self) -> str:
return "Specified terms MTC filter"
-class HpoMtcFilter(PhenotypeMtcFilter):
+class HpoMtcFilter(PhenotypeMtcFilter[hpotk.TermId]):
"""
`HpoMtcFilter` decides which phenotypes should be tested and which phenotypes are not worth testing.
The class leverages a number of heuristics and domain decisions.
See :ref:`hpo-mtc-filter-strategy` section for more info.
+
+ We recommend creating an instance using the :func:`default_filter` static factory method.
"""
# TODO: this has been here before. Is it still valid?
@@ -176,6 +276,7 @@ class HpoMtcFilter(PhenotypeMtcFilter):
def default_filter(
hpo: hpotk.MinimalOntology,
term_frequency_threshold: float,
+ phenotypic_abnormality: hpotk.TermId = hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY,
):
"""
Args:
@@ -184,21 +285,25 @@ def default_filter(
for an HPO term to have in at least one of the genotype groups
(e.g., 22% in missense and 3% in nonsense genotypes would be OK,
but not 13% missense and 10% nonsense genotypes if the threshold is 0.2)
+ phenotypic_abnormality: a :class:`~hpotk.TermId` corresponding to the root of HPO phenotype hierarchy.
+ Having to specify this option should be very rarely, if ever.
"""
- # ## Collect sets of top-level and second-level terms
+ # The very top of the ontology is completely uninteresting.
+ # Knowing about association between `All`, or `Phenotypic abnormality` tells us very little
+ top_level_terms = {phenotypic_abnormality, }
+ top_level_terms.update(hpo.graph.get_ancestors(phenotypic_abnormality))
+
+ # Collect sets of the 1st level terms
# e.g., Abnormality of the cardiovascular system (HP:0001626)
- top_level_terms = set(
- hpo.graph.get_children(hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY)
+ first_level_terms = set(
+ hpo.graph.get_children(phenotypic_abnormality)
)
- # e.g. Abnormality of the vasculature (HP:0002597)
- second_level_terms = set()
- third_level_terms = set()
-
# the second level (children of the top level), contains terms we want to keep,
# such as HP:0001611 Hypernasal speech, but it also contains "general" terms that
# we skip according to this heuristic, e.g., HP:0030680 Abnormal cardiovascular system morphology
- for t1 in top_level_terms:
+ second_level_terms = set()
+ for t1 in first_level_terms:
for t2 in hpo.graph.get_children(t1):
label = hpo.get_term_name(t2)
if label is not None and label.startswith("Abnormal"):
@@ -207,6 +312,7 @@ def default_filter(
# The third level (children of the second level), contains terms we want to keep,
# such as HP:0031109 Agalactia, but it also contains "general" terms
# that we skip according to this heuristic, e.g., HP:0006500 Abnormal lower limb epiphysis morphology
+ third_level_terms = set()
for t2 in second_level_terms:
for t3 in hpo.graph.get_children(t2):
label = hpo.get_term_name(t3)
@@ -217,6 +323,7 @@ def default_filter(
# In the future, one could do this on a per-level basis.
general_hpo_term_set = set()
general_hpo_term_set.update(top_level_terms)
+ general_hpo_term_set.update(first_level_terms)
general_hpo_term_set.update(second_level_terms)
general_hpo_term_set.update(third_level_terms)
@@ -248,6 +355,107 @@ def __init__(
# thus if the total count is 6 or less, there is no power - CAN WE SIMPLIFY?
self._general_hpo_terms = set(general_hpo_terms)
+ self._ok = PhenotypeMtcResult.ok()
+
+ def filter(
+ self,
+ gt_predicate: GenotypePolyPredicate,
+ phenotypes: typing.Sequence[hpotk.TermId],
+ counts: typing.Sequence[pd.DataFrame],
+ ) -> typing.Sequence[PhenotypeMtcResult]:
+ p_to_idx = {p: i for i, p in enumerate(phenotypes)}
+
+ results = [None for _ in range(len(phenotypes))]
+ categories = tuple(gt_predicate.get_categories())
+ for term_id in self._get_ordered_terms(phenotypes):
+ try:
+ idx = p_to_idx[term_id]
+ except KeyError:
+ # The procedure for getting the ordered terms produced a term
+ # that is not included in `phenotypes`.
+ # This is normal and we just move on.
+ continue
+
+ if term_id in self._general_hpo_terms:
+ results[idx] = PhenotypeMtcResult.fail("Skipping general term")
+ continue
+
+ if not self._hpo.graph.is_ancestor_of(
+ hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id
+ ):
+ results[idx] = PhenotypeMtcResult.fail("Skipping non phenotype term")
+ continue
+
+ # get total number of observations
+ # if term_id not in all_counts:
+ # reason_for_filtering_out["Skipping non-target term"] += 1
+ # continue
+
+ contingency_matrix = counts[idx]
+ total = contingency_matrix.sum().sum()
+ max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ contingency_matrix
+ )
+ if max_freq < self._hpo_term_frequency_filter:
+ reason = (
+ "Skipping term with maximum frequency "
+ f"that was less than threshold {self._hpo_term_frequency_filter}"
+ )
+ results[idx] = PhenotypeMtcResult.fail(reason)
+ continue
+
+ if contingency_matrix.shape == (2, 2) and total < 7:
+ reason = f"Skipping term with only {total} observations (not powered for 2x2)"
+ results[idx] = PhenotypeMtcResult.fail(reason)
+ continue
+
+ # todo -- similar for (3,2)
+ if not HpoMtcFilter.some_cell_has_greater_than_one_count(contingency_matrix):
+ reason = "Skipping term because no genotype has more than one observed HPO count"
+ results[idx] = PhenotypeMtcResult.fail(reason)
+ continue
+
+ elif HpoMtcFilter.genotypes_have_same_hpo_proportions(
+ contingency_matrix,
+ categories,
+ ):
+ reason = "Skipping term because all genotypes have same HPO observed proportions"
+ results[idx] = PhenotypeMtcResult.fail(reason)
+ continue
+
+ elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(
+ contingency_matrix,
+ categories,
+ ):
+ reason = "Skipping term because one genotype had zero observations"
+ results[idx] = PhenotypeMtcResult.fail(reason)
+ continue
+
+ # TODO: the code below actually did not work.
+ # for child_term_id in self._hpo.graph.get_children(term_id):
+ # if child_term_id in tested_counts_pf:
+ # if tested_counts_pf[child_term_id].equals(contingency_matrix):
+ # # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches?
+ # reason = "Child term with same counts previously tested"
+ # reason_for_filtering_out[reason] += 1
+ # continue
+
+ # The term should be tested if we get here.
+ results[idx] = self._ok
+
+ # There should be no `None` elements in `results` at this point.
+ if any(r is None for r in results):
+ missing = []
+ for i, result in enumerate(results):
+ if result is None:
+ term_name = self._hpo.get_term_name(phenotypes[i])
+ missing.append(term_name)
+
+ print('BLAAAAAA')
+ msg = 'Missing results for {}'.format(', '.join(missing))
+ raise ValueError(msg)
+
+ return tuple(results)
def filter_terms_to_test(
self,
@@ -275,6 +483,7 @@ def filter_terms_to_test(
) # key is an HP id, value is a tuple with counts, i.e.,
# iterate through the terms in the sorted order, starting from the leaves of the induced graph.
+ categories = tuple(gt_predicate.get_categories())
for term_id in self._get_ordered_terms(all_counts.keys()):
if term_id in self._general_hpo_terms:
reason_for_filtering_out["Skipping general term"] += 1
@@ -317,7 +526,7 @@ def filter_terms_to_test(
elif HpoMtcFilter.genotypes_have_same_hpo_proportions(
counts_frame,
- gt_predicate.get_categories(),
+ categories,
):
reason = "Skipping term because all genotypes have same HPO observed proportions"
reason_for_filtering_out[reason] += 1
@@ -325,7 +534,7 @@ def filter_terms_to_test(
elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(
counts_frame,
- gt_predicate.get_categories(),
+ categories,
):
reason = "Skipping term because one genotype had zero observations"
reason_for_filtering_out[reason] += 1
@@ -454,26 +663,12 @@ def _get_ordered_terms(
("leaves" in the graph of observed terms, but not necessarily leaves in the entire HPO graph)
to the most general terms.
"""
- # When building the induced graph based on the `term_ids`, we do not want to
- # include the term for *Phenotypic abnormality* and *All*.
- ancestors_to_ignore = set(
- self._hpo.graph.get_ancestors(
- hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY
- )
- )
- ancestors_to_ignore.add(hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY)
-
# A "leaf" refers to an annotated term none of whose children is annotated.
# May or may not be leaf in HPO graph
all_term_ids = set()
for term_id in term_ids:
all_term_ids.add(term_id)
- all_term_ids.update(
- filter(
- lambda t: t not in ancestors_to_ignore,
- self._hpo.graph.get_ancestors(term_id),
- )
- )
+ all_term_ids.update(self._hpo.graph.get_ancestors(term_id))
# get all of the leaf terms
hpo_leaf_term_id_set = set()
@@ -489,7 +684,7 @@ def _get_ordered_terms(
ordered_term_list = list() # reset, if needed
queue = deque(hpo_leaf_term_id_set)
- seen_terms = set(ancestors_to_ignore)
+ seen_terms = set()
while len(queue) > 0:
term_id = queue.popleft()
if term_id not in seen_terms:
diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py
new file mode 100644
index 00000000..6a82d92f
--- /dev/null
+++ b/src/gpsea/analysis/pcats/__init__.py
@@ -0,0 +1,29 @@
+"""
+The `gpsea.analysis.pcats` tests the association between genotype and phenotype groups,
+if the groups can be defined in terms of discrete, unique, and non-overlapping categories.
+
+Each individual is assigned into a genotype and phenotype group
+using :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+and :class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate` respectively.
+
+A contingency matrix with group counts is prepared
+and the counts are tested for association using :class:`~gpsea.analysis.pcats.stats.CountStatistic`,
+such as :class:`~gpsea.analysis.pcats.stats.ScipyFisherExact`.
+
+It is typical to test several phenotype groups at the same time.
+Therefore, we must correct for multiple testing to prevent false positive findings.
+See :ref:`MTC section ` for more info.
+
+The results are provided as :class:`MultiPhenotypeAnalysisResult`
+(or more specific :class:`HpoTermAnalysisResult` for :class:`HpoTermAnalysis`).
+"""
+
+from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult
+from ._impl import DiseaseAnalysis
+from ._impl import HpoTermAnalysis, HpoTermAnalysisResult
+
+__all__ = [
+ 'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult',
+ 'DiseaseAnalysis',
+ 'HpoTermAnalysis', 'HpoTermAnalysisResult',
+]
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
new file mode 100644
index 00000000..61841a51
--- /dev/null
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -0,0 +1,561 @@
+import abc
+import math
+import typing
+
+from collections import Counter
+
+import hpotk
+import numpy as np
+import pandas as pd
+
+from statsmodels.stats import multitest
+
+from gpsea.model import Patient
+from gpsea.analysis.pcats.stats import CountStatistic
+from ..predicate import PatientCategory
+from ..predicate.genotype import GenotypePolyPredicate
+from ..predicate.phenotype import P, PhenotypePolyPredicate, prepare_predicates_for_terms_of_interest
+from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult
+
+
+def apply_predicates_on_patients(
+ patients: typing.Iterable[Patient],
+ gt_predicate: GenotypePolyPredicate,
+ pheno_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
+) -> typing.Tuple[
+ typing.Sequence[int],
+ typing.Sequence[pd.DataFrame],
+]:
+ """
+ Apply the phenotype predicates `pheno_predicates` and the genotype predicate `gt_predicate`
+ to bin the `patients` into categories.
+
+ Note, it may not be possible to bin *all* patients with a genotype/phenotype pair,
+ since a predicate is allowed to return `None` (e.g. if it bins the patient into MISSENSE or NONSENSE groups
+ but the patient has no MISSENSE or NONSENSE variants). If this happens, the patient will not be "usable"
+ for the phenotype `P`.
+
+ Args:
+ patients: a sequence of the patients to bin into categories
+ gt_predicate: a genotype predicate to apply
+ pheno_predicates: a sequence with the phenotype predicates to apply
+
+ Returns:
+ a tuple with 2 items:
+ - a sequence with counts of patients that could be binned according to the phenotype `P`.
+ - a sequence with data frames with counts of patients in i-th phenotype category
+ and j-th genotype category where i and j are rows and columns of the data frame.
+ """
+ n_usable_patient_counter = Counter()
+
+ # Apply genotype and phenotype predicates
+ count_dict = {}
+ for ph_predicate in pheno_predicates:
+ if ph_predicate.phenotype not in count_dict:
+ # Make an empty frame for keeping track of the counts.
+ count_dict[ph_predicate.phenotype] = pd.DataFrame(
+ data=0,
+ index=pd.Index(
+ data=ph_predicate.get_categories(),
+ name=ph_predicate.get_question(),
+ ),
+ columns=pd.Index(
+ data=gt_predicate.get_categories(),
+ name=gt_predicate.get_question(),
+ ),
+ )
+
+ for patient in patients:
+ pheno_cat = ph_predicate.test(patient)
+ geno_cat = gt_predicate.test(patient)
+
+ if pheno_cat is not None and geno_cat is not None:
+ count_dict[pheno_cat.phenotype].loc[
+ pheno_cat.category, geno_cat.category
+ ] += 1
+ n_usable_patient_counter[pheno_cat.phenotype] += 1
+
+ # Convert dicts to numpy arrays
+ n_usable_patients = [
+ n_usable_patient_counter[ph_predicate.phenotype]
+ for ph_predicate in pheno_predicates
+ ]
+
+ counts = [count_dict[ph_predicate.phenotype] for ph_predicate in pheno_predicates]
+
+ return n_usable_patients, counts
+
+
+class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta):
+ """
+ `MultiPhenotypeAnalysisResult` reports the outcome of :class:`MultiPhenotypeAnalysis`.
+
+ The result consists of several arrays with items
+ with the order determined by the phenotype order.
+ """
+
+ @property
+ @abc.abstractmethod
+ def phenotypes(self) -> typing.Sequence[P]:
+ """
+ Get the tested phenotypes.
+ """
+ pass
+
+ @property
+ @abc.abstractmethod
+ def n_usable(self) -> typing.Sequence[int]:
+ """
+ Get a sequence of numbers of patients where the phenotype was assessable,
+ and are, thus, usable for genotype-phenotype correlation analysis.
+ """
+ pass
+
+ @property
+ @abc.abstractmethod
+ def all_counts(self) -> typing.Sequence[pd.DataFrame]:
+ """
+ Get a :class:`~pandas.DataFrame` sequence where each `DataFrame` includes the counts of patients
+ in genotype and phenotype groups.
+
+ An example for a genotype predicate that bins into two categories (`Yes` and `No`) based on presence
+ of a missense variant in transcript `NM_123456.7`, and phenotype predicate that checks
+ presence/absence of `HP:0001166` (a phenotype term)::
+
+ Has MISSENSE_VARIANT in NM_123456.7
+ No Yes
+ Present
+ Yes 1 13
+ No 7 5
+
+ The rows correspond to the phenotype categories, and the columns represent the genotype categories.
+ """
+ pass
+
+ @property
+ @abc.abstractmethod
+ def pvals(self) -> typing.Sequence[float]:
+ """
+ Get a sequence of nominal p values for each tested HPO term.
+ The sequence includes a `NaN` value for each input phenotype that was *not* tested.
+ """
+ pass
+
+ @property
+ @abc.abstractmethod
+ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]:
+ """
+ Get a sequence with p values for each tested HPO term after multiple testing correction
+ or `None` if the correction was not applied.
+ The sequence includes a `NaN` value for each input phenotype that was *not* tested.
+ """
+ pass
+
+ @property
+ def total_tests(self) -> int:
+ """
+ Get total count of tests that were run for this analysis.
+ """
+ return sum(1 for pval in self.pvals if not math.isnan(pval))
+
+ def summarize(
+ self,
+ hpo: hpotk.MinimalOntology,
+ target_phenotype_category: PatientCategory,
+ ) -> pd.DataFrame:
+ """
+ Create a data frame with summary of the genotype phenotype analysis.
+
+ The *rows* of the frame correspond to the analyzed HPO terms.
+
+ The columns of the data frame have `Count` and `Percentage` per used genotype predicate.
+
+ **Example**
+
+ If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate`
+ which can compare phenotype with and without a missense variant, we will have a data frame
+ that looks like this::
+
+ MISSENSE_VARIANT on `NM_1234.5` No Yes
+ Count Percent Count Percent Corrected p value p value
+ Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781
+ Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000
+ Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000
+ ... ... ... ... ... ... ...
+ """
+
+ # TODO: this should be plotted by a formatter.
+ # Row index: a list of tested HPO terms
+ pheno_idx = pd.Index(self.phenotypes)
+ # Column index: multiindex of counts and percentages for all genotype predicate groups
+ gt_idx = pd.MultiIndex.from_product(
+ # TODO: fix the below
+ iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")),
+ names=(self._gt_predicate.get_question(), None),
+ )
+
+ # We'll fill this frame with data
+ df = pd.DataFrame(index=pheno_idx, columns=gt_idx)
+
+ for phenotype, count in zip(self.phenotypes, self.all_counts):
+ gt_totals = (
+ count.sum()
+ ) # Sum across the phenotype categories (collapse the rows).
+ for gt_cat in count.columns:
+ cnt = count.loc[target_phenotype_category, gt_cat]
+ total = gt_totals[gt_cat]
+ df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}"
+ pct = 0 if total == 0 else round(cnt * 100 / total)
+ df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%"
+
+ # Add columns with p values and corrected p values (if present)
+ p_val_col_name = "p values"
+ corrected_p_val_col_name = "Corrected p values"
+ if self.corrected_pvals is not None:
+ df.insert(
+ df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals
+ )
+ df.insert(df.shape[1], ("", p_val_col_name), self.pvals)
+
+ # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms
+ # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`).
+ labeled_idx = df.index.map(
+ lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id)
+ )
+
+ # Last, sort by corrected p value or just p value
+ df = df.set_index(labeled_idx)
+ if self.corrected_pvals is not None:
+ return df.sort_values(
+ by=[("", corrected_p_val_col_name), ("", p_val_col_name)]
+ )
+ else:
+ return df.sort_values(by=("", p_val_col_name))
+
+ @staticmethod
+ def _format_term_id(
+ hpo: hpotk.MinimalOntology,
+ term_id: hpotk.TermId,
+ ) -> str:
+ """
+ Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs
+ are formatted as CURIEs (e.g. `OMIM:123000`).
+ """
+ if term_id.prefix == "HP":
+ term_name = hpo.get_term_name(term_id)
+ return f"{term_name} [{term_id.value}]"
+ else:
+ return term_id.value
+
+
+class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta):
+
+ def __init__(
+ self,
+ count_statistic: CountStatistic,
+ mtc_correction: typing.Optional[str] = None,
+ mtc_alpha: float = 0.05,
+ ):
+ assert isinstance(count_statistic, CountStatistic)
+ self._count_statistic = count_statistic
+ self._mtc_correction = mtc_correction
+ self._mtc_alpha = mtc_alpha
+
+ @abc.abstractmethod
+ def compare_genotype_vs_phenotypes(
+ self,
+ cohort: typing.Iterable[Patient],
+ gt_predicate: GenotypePolyPredicate,
+ pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+ ) -> MultiPhenotypeAnalysisResult[P]:
+ pass
+
+ def _compute_nominal_pvals(
+ self,
+ n_usable: typing.Iterable[int],
+ all_counts: typing.Iterable[pd.DataFrame],
+ ) -> np.ndarray:
+ pvals = []
+ for i, (usable, count) in enumerate(zip(n_usable, all_counts)):
+ if usable == 0:
+ pvals.append(np.nan)
+ else:
+ try:
+ pval = self._count_statistic.compute_pval(count)
+ pvals.append(pval)
+ except ValueError as ve:
+ # TODO: add more context to the exception?
+ raise ve
+
+ return np.array(pvals)
+
+ def _apply_mtc(
+ self,
+ pvals: typing.Sequence[float],
+ ) -> typing.Optional[typing.Sequence[float]]:
+ _, corrected_pvals, _, _ = multitest.multipletests(
+ pvals=pvals,
+ alpha=self._mtc_alpha,
+ method=self._mtc_correction,
+ is_sorted=False,
+ returnsorted=False,
+ )
+ return corrected_pvals
+
+
+class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysisResult[P]):
+
+ def __init__(
+ self,
+ phenotypes: typing.Sequence[P],
+ n_usable: typing.Sequence[int],
+ all_counts: typing.Sequence[pd.DataFrame],
+ pvals: typing.Sequence[float],
+ corrected_pvals: typing.Optional[typing.Sequence[float]],
+ gt_predicate: GenotypePolyPredicate,
+ ):
+ self._phenotypes = tuple(phenotypes)
+ self._n_usable = tuple(n_usable)
+ self._all_counts = tuple(all_counts)
+ self._pvals = tuple(pvals)
+ self._corrected_pvals = (
+ None if corrected_pvals is None else tuple(corrected_pvals)
+ )
+ assert isinstance(gt_predicate, GenotypePolyPredicate)
+ self._gt_predicate = gt_predicate
+
+ @property
+ def phenotypes(self) -> typing.Sequence[P]:
+ return self._phenotypes
+
+ @property
+ def n_usable(self) -> typing.Sequence[int]:
+ return self._n_usable
+
+ @property
+ def all_counts(self) -> typing.Sequence[pd.DataFrame]:
+ return self._all_counts
+
+ @property
+ def pvals(self) -> typing.Sequence[float]:
+ return self._pvals
+
+ @property
+ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]:
+ return self._corrected_pvals
+
+ def __eq__(self, other):
+ return isinstance(other, BaseMultiPhenotypeAnalysisResult) \
+ and self._phenotypes == other._phenotypes \
+ and self._n_usable == other._n_usable \
+ and self._all_counts == other._all_counts \
+ and self._pvals == other._pvals \
+ and self._corrected_pvals == other._corrected_pvals\
+ and self._gt_predicate == other._gt_predicate
+
+ def __hash__(self):
+ # NOTE: if a field is added, the hash method of the subclasses must be updated as well.
+ return hash((
+ self._phenotypes, self._n_usable, self._all_counts,
+ self._pvals, self._corrected_pvals, self._gt_predicate,
+ ))
+
+
+class DiseaseAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]):
+
+ def compare_genotype_vs_phenotypes(
+ self,
+ cohort: typing.Iterable[Patient],
+ gt_predicate: GenotypePolyPredicate,
+ pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]],
+ ) -> MultiPhenotypeAnalysisResult[hpotk.TermId]:
+ pheno_predicates = tuple(pheno_predicates)
+ if len(pheno_predicates) == 0:
+ raise ValueError("No phenotype predicates were provided")
+
+ # 1 - Count the patients
+ n_usable, all_counts = apply_predicates_on_patients(
+ patients=cohort,
+ gt_predicate=gt_predicate,
+ pheno_predicates=pheno_predicates,
+ )
+
+ # 2 - Compute nominal p values
+ pvals = self._compute_nominal_pvals(n_usable=n_usable, all_counts=all_counts)
+
+ # 3 - Apply Multiple Testing Correction
+ if self._mtc_correction is None:
+ corrected_pvals = None
+ else:
+ corrected_pvals = self._apply_mtc(pvals=pvals)
+
+ phenotypes = tuple(p.phenotype for p in pheno_predicates)
+
+ return BaseMultiPhenotypeAnalysisResult(
+ phenotypes=phenotypes,
+ n_usable=n_usable,
+ all_counts=all_counts,
+ pvals=pvals,
+ corrected_pvals=corrected_pvals,
+ gt_predicate=gt_predicate,
+ )
+
+
+class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]):
+ """
+ `HpoTermAnalysisResult` includes the :class:`HpoTermAnalysis` results.
+
+ On top of the attributes of :class:`MultiPhenotypeAnalysisResult` parent,
+ the results include the outcome of :class:`PhenotypeMtcFilter`.
+ """
+
+ def __init__(
+ self,
+ phenotypes: typing.Sequence[hpotk.TermId],
+ n_usable: typing.Sequence[int],
+ all_counts: typing.Sequence[pd.DataFrame],
+ pvals: typing.Sequence[float],
+ corrected_pvals: typing.Optional[typing.Sequence[float]],
+ gt_predicate: GenotypePolyPredicate,
+ mtc_filter_name: str,
+ mtc_filter_results: typing.Sequence[PhenotypeMtcResult],
+ mtc_name: typing.Optional[str],
+ ):
+ super().__init__(
+ phenotypes=phenotypes,
+ n_usable=n_usable,
+ all_counts=all_counts,
+ pvals=pvals,
+ corrected_pvals=corrected_pvals,
+ gt_predicate=gt_predicate,
+ )
+ self._mtc_filter_name = mtc_filter_name
+ self._mtc_filter_results = tuple(mtc_filter_results)
+ self._mtc_name = mtc_name
+
+ @property
+ def mtc_filter_name(self) -> str:
+ """
+ Get the MTC filter name.
+ """
+ return self._mtc_filter_name
+
+ @property
+ def mtc_filter_results(self) -> typing.Sequence[PhenotypeMtcResult]:
+ """
+ Get a :class:`PhenotypeMtcResult` for each of the :attr:`phenotypes`.
+ """
+ return self._mtc_filter_results
+
+ @property
+ def mtc_name(self) -> typing.Optional[str]:
+ """
+ Get the name of the multiple testing correction (MTC) procedure (e.g. `bonferroni`, `fdr_bh`, ...)
+ or `None` if no MTC was performed.
+ """
+ return self._mtc_name
+
+ def __eq__(self, other):
+ return isinstance(other, HpoTermAnalysisResult) \
+ and BaseMultiPhenotypeAnalysisResult.__eq__(self, other) \
+ and self._mtc_filter_name == other._mtc_filter_name \
+ and self._mtc_filter_results == other._mtc_filter_results \
+ and self._mtc_name == other._mtc_name
+
+ def __hash__(self):
+ return hash((
+ self._phenotypes, self._n_usable, self._all_counts,
+ self._pvals, self._corrected_pvals, self._gt_predicate,
+ self._mtc_filter_name, self._mtc_filter_results, self._mtc_name,
+ ))
+
+
+class HpoTermAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]):
+ """
+ `HpoTermAnalysis` can be applied if the individual phenotypes are represented as HPO terms.
+
+ The analysis applies the genotype and phenotype predicates, computes the nominal p values,
+ and addresses the multiple testing burden by applying the :class:`PhenotypeMtcFilter`
+ followed by the multiple testing correction `mtc_correction` method.
+
+ `PhenotypeMtcFilter` is applied even if no MTC should be applied.
+ """
+
+ def __init__(
+ self,
+ count_statistic: CountStatistic,
+ mtc_filter: PhenotypeMtcFilter,
+ mtc_correction: typing.Optional[str] = None,
+ mtc_alpha: float = 0.05,
+ ):
+ super().__init__(
+ count_statistic=count_statistic,
+ mtc_correction=mtc_correction,
+ mtc_alpha=mtc_alpha,
+ )
+ assert isinstance(mtc_filter, PhenotypeMtcFilter)
+ self._mtc_filter = mtc_filter
+
+ def compare_genotype_vs_phenotypes(
+ self,
+ cohort: typing.Iterable[Patient],
+ gt_predicate: GenotypePolyPredicate,
+ pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]],
+ ) -> HpoTermAnalysisResult:
+ pheno_predicates = tuple(pheno_predicates)
+ if len(pheno_predicates) == 0:
+ raise ValueError("No phenotype predicates were provided")
+ phenotypes = tuple(p.phenotype for p in pheno_predicates)
+
+ # 1 - Count the patients
+ n_usable, all_counts = apply_predicates_on_patients(
+ cohort,
+ gt_predicate,
+ pheno_predicates,
+ )
+
+ # 2 - Apply MTC filter and select p values to MTC
+ mtc_filter_results = self._mtc_filter.filter(
+ gt_predicate=gt_predicate,
+ phenotypes=phenotypes,
+ counts=all_counts,
+ )
+ mtc_mask = np.array([r.is_passed() for r in mtc_filter_results])
+
+ # 3 - Compute nominal p values
+ pvals = np.full(shape=(len(n_usable),), fill_value=np.nan)
+ pvals[mtc_mask] = self._compute_nominal_pvals(
+ n_usable=slice_list_in_numpy_style(n_usable, mtc_mask),
+ all_counts=slice_list_in_numpy_style(all_counts, mtc_mask),
+ )
+
+ # 4 - Apply Multiple Testing Correction
+ if self._mtc_correction is None:
+ corrected_pvals = None
+ else:
+ corrected_pvals = np.full(shape=pvals.shape, fill_value=np.nan)
+ # Do not test the p values that have been filtered out.
+ corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask])
+
+ return HpoTermAnalysisResult(
+ phenotypes=phenotypes,
+ n_usable=n_usable,
+ all_counts=all_counts,
+ pvals=pvals,
+ corrected_pvals=corrected_pvals,
+ gt_predicate=gt_predicate,
+ mtc_filter_name=self._mtc_filter.filter_method_name(),
+ mtc_filter_results=mtc_filter_results,
+ mtc_name=self._mtc_correction,
+ )
+
+
+WHATEVER = typing.TypeVar('WHATEVER')
+
+
+def slice_list_in_numpy_style(
+ a: typing.Sequence[WHATEVER],
+ mask: typing.Sequence[bool],
+) -> typing.Sequence[WHATEVER]:
+ assert len(a) == len(mask)
+ return [item for item, mask_is_set in zip(a, mask) if mask_is_set]
diff --git a/src/gpsea/analysis/pcats/stats/__init__.py b/src/gpsea/analysis/pcats/stats/__init__.py
new file mode 100644
index 00000000..4d7e0543
--- /dev/null
+++ b/src/gpsea/analysis/pcats/stats/__init__.py
@@ -0,0 +1,5 @@
+from ._stats import CountStatistic, ScipyFisherExact, PythonMultiFisherExact
+
+__all__ = [
+ 'CountStatistic', 'ScipyFisherExact', 'PythonMultiFisherExact',
+]
diff --git a/src/gpsea/analysis/pcats/stats/_stats.py b/src/gpsea/analysis/pcats/stats/_stats.py
new file mode 100644
index 00000000..8df07fdc
--- /dev/null
+++ b/src/gpsea/analysis/pcats/stats/_stats.py
@@ -0,0 +1,177 @@
+import abc
+import math
+from decimal import Decimal
+
+
+import numpy as np
+import pandas as pd
+
+import scipy
+
+
+class CountStatistic(metaclass=abc.ABCMeta):
+ """
+ `CountStatistic` calculates a p value for a contingency table
+ produced by a pair of discrete random variables.
+
+ The `counts` table is usually `2x2` or `2x3`.
+ """
+
+ @abc.abstractmethod
+ def compute_pval(
+ self,
+ counts: pd.DataFrame,
+ ) -> float:
+ pass
+
+
+class ScipyFisherExact(CountStatistic):
+ """
+ `ScipyFisherExact` performs Fisher Exact Test on a `2x2` contingency table.
+
+ The class is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function.
+ The two-sided :math:`H_1` is considered.
+ """
+
+ def compute_pval(
+ self,
+ counts: pd.DataFrame,
+ ) -> float:
+ assert counts.shape == (
+ 2,
+ 2,
+ ), f"Cannot run Fisher exact test on an array with {counts.shape} shape"
+ _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided")
+ return pval
+
+
+class PythonMultiFisherExact(CountStatistic):
+ """
+ `PythonMultiFisherExact` is a Python implementation of Fisher Exact Test to compute
+ p value for a `2x3` contingency table.
+ """
+
+ def compute_pval(
+ self,
+ counts: pd.DataFrame,
+ ) -> float:
+ PythonMultiFisherExact._check_input(counts)
+ return self._fisher_exact(counts.values)
+
+ @staticmethod
+ def _check_input(a: pd.DataFrame):
+ if not isinstance(a, pd.DataFrame):
+ raise ValueError(f"Expected a pandas DataFrame but got {type(a)}")
+ if not a.shape == (2, 3):
+ raise ValueError(f"Shape of the array must be (2, 3) but got {a.shape}")
+ if np.array_equal(a.values, np.zeros_like(a)):
+ raise ValueError("Data frame is all zeros, cannot run analysis")
+
+ def _fisher_exact(
+ self,
+ table: np.ndarray,
+ ):
+ row_sum = []
+ col_sum = []
+
+ for i in range(len(table)):
+ temp = 0
+ for j in range(len(table[0])):
+ temp += table[i][j]
+ row_sum.append(temp)
+
+ for j in range(len(table[0])):
+ temp = 0
+ for i in range(len(table)):
+ temp += table[i][j]
+ col_sum.append(temp)
+
+ mat = [[0] * len(col_sum)] * len(row_sum)
+ pos = (0, 0)
+
+ p_0 = 1
+
+ for x in row_sum:
+ p_0 *= math.factorial(x)
+ for y in col_sum:
+ p_0 *= math.factorial(y)
+
+ n = 0
+ for x in row_sum:
+ n += x
+ p_0 /= Decimal(math.factorial(n))
+
+ for i in range(len(table)):
+ for j in range(len(table[0])):
+ p_0 /= Decimal(math.factorial(table[i][j]))
+
+ p = [0]
+ self._dfs(mat, pos, row_sum, col_sum, p_0, p)
+
+ return float(p[0])
+
+ def _dfs(self, mat, pos, r_sum, c_sum, p_0, p):
+
+ (xx, yy) = pos
+ (r, c) = (len(r_sum), len(c_sum))
+
+ mat_new = []
+
+ for i in range(len(mat)):
+ temp = []
+ for j in range(len(mat[0])):
+ temp.append(mat[i][j])
+ mat_new.append(temp)
+
+ if xx == -1 and yy == -1:
+ for i in range(r - 1):
+ temp = r_sum[i]
+ for j in range(c - 1):
+ temp -= mat_new[i][j]
+ mat_new[i][c - 1] = temp
+ for j in range(c - 1):
+ temp = c_sum[j]
+ for i in range(r - 1):
+ temp -= mat_new[i][j]
+ mat_new[r - 1][j] = temp
+ temp = r_sum[r - 1]
+ for j in range(c - 1):
+ temp -= mat_new[r - 1][j]
+ if temp < 0:
+ return
+ mat_new[r - 1][c - 1] = temp
+
+ p_1 = 1
+ for x in r_sum:
+ p_1 *= math.factorial(x)
+ for y in c_sum:
+ p_1 *= math.factorial(y)
+
+ n = 0
+ for x in r_sum:
+ n += x
+ p_1 /= Decimal(math.factorial(n))
+
+ for i in range(len(mat_new)):
+ for j in range(len(mat_new[0])):
+ p_1 /= Decimal(math.factorial(mat_new[i][j]))
+ if p_1 <= p_0 + Decimal(0.00000001):
+ # print(mat_new)
+ # print(p_1)
+ p[0] += p_1
+ else:
+ max_1 = r_sum[xx]
+ max_2 = c_sum[yy]
+ for j in range(c):
+ max_1 -= mat_new[xx][j]
+ for i in range(r):
+ max_2 -= mat_new[i][yy]
+ for k in range(min(max_1, max_2) + 1):
+ mat_new[xx][yy] = k
+ if xx == r - 2 and yy == c - 2:
+ pos_new = (-1, -1)
+ elif xx == r - 2:
+ pos_new = (0, yy + 1)
+ else:
+ pos_new = (xx + 1, yy)
+ self._dfs(mat_new, pos_new, r_sum, c_sum, p_0, p)
diff --git a/src/gpsea/analysis/pcats/stats/_test__stats.py b/src/gpsea/analysis/pcats/stats/_test__stats.py
new file mode 100644
index 00000000..aa254306
--- /dev/null
+++ b/src/gpsea/analysis/pcats/stats/_test__stats.py
@@ -0,0 +1,33 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from ._stats import PythonMultiFisherExact
+
+
+class TestPythonMultiFisherExact:
+
+ @pytest.fixture
+ def fisher_exact(self) -> PythonMultiFisherExact:
+ return PythonMultiFisherExact()
+
+ @pytest.mark.parametrize(
+ "counts, expected",
+ (
+ [[[2, 1, 0], [3, 0, 2]], 0.6428571428571429],
+ [[[5, 5, 5], [5, 5, 5]], 1],
+ [[[10, 2, 3], [1, 3, 4]], 0.03952977071835599],
+ [[[6, 0, 0], [6, 5, 8]], 0.02334274421230943],
+ ),
+ )
+ def test_compute_pval(
+ self,
+ counts,
+ expected,
+ fisher_exact: PythonMultiFisherExact,
+ ):
+ contingency_matrix = pd.DataFrame(np.array(counts, dtype=np.int64))
+
+ final_pval = fisher_exact.compute_pval(contingency_matrix)
+ assert final_pval == pytest.approx(expected)
+
diff --git a/src/gpsea/analysis/predicate/__init__.py b/src/gpsea/analysis/predicate/__init__.py
index a2e0cb54..b4f3b28b 100644
--- a/src/gpsea/analysis/predicate/__init__.py
+++ b/src/gpsea/analysis/predicate/__init__.py
@@ -1,12 +1,7 @@
-from . import genotype
-from . import phenotype
-
from ._api import PatientCategory, PatientCategories, Categorization, C
from ._api import PolyPredicate
-from ._api import GenotypePolyPredicate, GenotypeBooleanPredicate
__all__ = [
'PatientCategory', 'PatientCategories', 'Categorization', 'C',
'PolyPredicate',
- 'GenotypePolyPredicate', 'GenotypeBooleanPredicate',
]
diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py
index 2f1a62ea..8c62e191 100644
--- a/src/gpsea/analysis/predicate/_api.py
+++ b/src/gpsea/analysis/predicate/_api.py
@@ -148,11 +148,38 @@ def get_categorizations(self) -> typing.Sequence[C]:
"""
pass
- def get_categories(self) -> typing.Sequence[PatientCategory]:
+ def get_categories(self) -> typing.Iterator[PatientCategory]:
"""
- Get a sequence with :class:`PatientCategory` instances that the predicate can produce.
+ Get an iterator with :class:`PatientCategory` instances that the predicate can produce.
"""
- return tuple(c.category for c in self.get_categorizations())
+ return (c.category for c in self.get_categorizations())
+
+ def get_category(
+ self,
+ cat_id: int,
+ ) -> PatientCategory:
+ """
+ Get the category name for a :class:`PatientCategory.cat_id`.
+
+ :param cat_id: an `int` with the id.
+ :raises: ValueError if there is no such category was defined.
+ """
+ for ctg in self.get_categories():
+ if ctg.cat_id == cat_id:
+ return ctg.name
+ raise ValueError(f'No category for {cat_id} was found')
+
+ def get_category_name(
+ self,
+ cat_id: int,
+ ) -> str:
+ """
+ Get the category name for a :class:`PatientCategory.cat_id`.
+
+ :param cat_id: an `int` with the id.
+ :raises: ValueError if there is no such category was defined.
+ """
+ return self.get_category(cat_id).name
@abc.abstractmethod
def get_question(self) -> str:
@@ -183,35 +210,3 @@ def _check_patient(patient: Patient):
"""
if not isinstance(patient, Patient):
raise ValueError(f"patient must be type Patient but was type {type(patient)}")
-
-
-class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta):
- """
- `GenotypePolyPredicate` constrains `PolyPredicate` to investigate the genotype aspects
- of patients.
- """
- pass
-
-
-class GenotypeBooleanPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
- """
- `GenotypeBooleanPredicate` tests if a :class:`~gpsea.model.Patient` belongs to a genotype group
- and returns a boolean binning.
- """
- YES = Categorization(PatientCategories.YES)
- NO = Categorization(PatientCategories.NO)
-
- def get_categorizations(self) -> typing.Sequence[Categorization]:
- """
- The predicate bins a patient into :class:`BooleanPredicate.NO` or :class:`BooleanPredicate.YES` category.
- """
- return GenotypeBooleanPredicate.YES, GenotypeBooleanPredicate.NO
-
-
-class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
- BOTH = Categorization(PatientCategory(0, 'Both', 'The patient belongs in both groups.'))
- ONE = Categorization(PatientCategory(1, 'One', 'The patient belongs in one of the two groups.'))
- NEITHER = Categorization(PatientCategory(2, 'Neither', 'The patient does not belong in either group.'))
-
- def get_categorizations(self) -> typing.Sequence[Categorization]:
- return RecessiveGroupingPredicate.BOTH, RecessiveGroupingPredicate.ONE, RecessiveGroupingPredicate.NEITHER
diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py
index feff1703..a8a2ba91 100644
--- a/src/gpsea/analysis/predicate/genotype/__init__.py
+++ b/src/gpsea/analysis/predicate/genotype/__init__.py
@@ -1,9 +1,11 @@
+from ._api import GenotypePolyPredicate
from ._api import VariantPredicate
from ._counter import AlleleCounter
from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate
from ._variant import VariantPredicates, ProteinPredicates
__all__ = [
+ 'GenotypePolyPredicate',
'boolean_predicate', 'groups_predicate', 'recessive_predicate',
'AlleleCounter', 'VariantPredicate',
'VariantPredicates', 'ProteinPredicates',
diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py
index 5e1a201f..39cf020b 100644
--- a/src/gpsea/analysis/predicate/genotype/_api.py
+++ b/src/gpsea/analysis/predicate/genotype/_api.py
@@ -2,13 +2,31 @@
import typing
from gpsea.model import Variant
+from .._api import PolyPredicate, Categorization, PatientCategory
+
+
+class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta):
+ """
+ `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate`
+ that test the genotype axis.
+ """
+ pass
+
+
+class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta):
+ BOTH = Categorization(PatientCategory(0, 'Both', 'The patient belongs in both groups.'))
+ ONE = Categorization(PatientCategory(1, 'One', 'The patient belongs in one of the two groups.'))
+ NEITHER = Categorization(PatientCategory(2, 'Neither', 'The patient does not belong in either group.'))
+
+ def get_categorizations(self) -> typing.Sequence[Categorization]:
+ return RecessiveGroupingPredicate.BOTH, RecessiveGroupingPredicate.ONE, RecessiveGroupingPredicate.NEITHER
class VariantPredicate(metaclass=abc.ABCMeta):
"""
`VariantPredicate` tests if a variant meets a certain criterion.
- The subclasses are expected to implement all abstract methods of this class
+ The subclasses are expected to implement all abstract methods of this class
*plus* ``__eq__`` and ``__hash__``, to support building of compound predicates.
We *strongly* recommend implementing ``__str__`` and ``__repr__`` as well.
@@ -43,7 +61,7 @@ def __and__(self, other):
return self
if isinstance(self, AllVariantPredicate) and isinstance(other, AllVariantPredicate):
- # Merging two *all* variant predicates is equivalent
+ # Merging two *all* variant predicates is equivalent
# to chaining their predicates
return AllVariantPredicate(*self.predicates, *other.predicates)
else:
@@ -60,7 +78,7 @@ def __or__(self, other):
return self
if isinstance(self, AnyVariantPredicate) and isinstance(other, AnyVariantPredicate):
- # Merging two any variant predicates is equivalent
+ # Merging two any variant predicates is equivalent
# to chaining their predicates
return AnyVariantPredicate(*self.predicates, *other.predicates)
else:
@@ -71,7 +89,7 @@ def __or__(self, other):
def __invert__(self):
"""
Create a variant predicate that passes if the underlying predicate fails.
- """
+ """
if isinstance(self, InvVariantPredicate):
# Unwrap to prevent double negation
return self._inner
@@ -89,7 +107,6 @@ def __init__(
if len(predicates) == 0:
raise ValueError('Predicates must not be empty!')
self._predicates = tuple(predicates)
-
@property
def predicates(self) -> typing.Sequence[VariantPredicate]:
@@ -169,4 +186,3 @@ def __str__(self) -> str:
def __repr__(self) -> str:
return f'NotVariantPredicate(inner={self._inner})'
-
\ No newline at end of file
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index be0251d2..3f54a7e0 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -2,17 +2,19 @@
from gpsea.model import Patient
-from .._api import Categorization
-from .._api import GenotypePolyPredicate, GenotypeBooleanPredicate, RecessiveGroupingPredicate
+from .._api import Categorization, PatientCategories
+from ._api import GenotypePolyPredicate, RecessiveGroupingPredicate
from ._api import VariantPredicate
from ._counter import AlleleCounter
-class AlleleCountingGenotypeBooleanPredicate(GenotypeBooleanPredicate):
+class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate):
# NOT PART OF THE PUBLIC API
"""
The predicate tests presence of at least one matching allele in the patient.
"""
+ YES = Categorization(PatientCategories.YES)
+ NO = Categorization(PatientCategories.NO)
@staticmethod
def for_variant_predicate(predicate: VariantPredicate):
@@ -22,6 +24,14 @@ def for_variant_predicate(predicate: VariantPredicate):
def __init__(self, allele_counter: AlleleCounter):
self._allele_counter = allele_counter
+ def get_categorizations(self) -> typing.Sequence[Categorization]:
+ """
+ The predicate bins a patient into
+ :attr:`AlleleCountingGenotypeBooleanPredicate.NO`
+ or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category.
+ """
+ return AlleleCountingGenotypeBooleanPredicate.YES, AlleleCountingGenotypeBooleanPredicate.NO
+
def get_question(self) -> str:
return self._allele_counter.get_question()
@@ -30,16 +40,17 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
allele_count = self._allele_counter.count(patient)
if allele_count > 0:
- return GenotypeBooleanPredicate.YES
+ return AlleleCountingGenotypeBooleanPredicate.YES
elif allele_count == 0:
- return GenotypeBooleanPredicate.NO
+ return AlleleCountingGenotypeBooleanPredicate.NO
else:
raise ValueError(
f"Allele counter should return a non-negative allele count: {allele_count}"
)
def __eq__(self, value: object) -> bool:
- return isinstance(value, AlleleCountingGenotypeBooleanPredicate) and self._allele_counter == value._allele_counter
+ return isinstance(value, AlleleCountingGenotypeBooleanPredicate) \
+ and self._allele_counter == value._allele_counter
def __hash__(self) -> int:
return hash((self._allele_counter,))
@@ -52,7 +63,7 @@ def __repr__(self) -> str:
# TODO: write AD, AR, XLR, XLD
-def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypeBooleanPredicate:
+def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate:
"""
Create a genotype boolean predicate from given `variant_predicate`
to test for presence of at least one matching allele in the patient.
diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py
index 517f5970..aeaf6e1d 100644
--- a/src/gpsea/analysis/predicate/genotype/_variant.py
+++ b/src/gpsea/analysis/predicate/genotype/_variant.py
@@ -30,12 +30,12 @@ def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate:
Build a predicate to test if variant has a functional annotation to genes *SURF1* and *SURF2*:
>>> from gpsea.analysis.predicate.genotype import VariantPredicates
-
+
>>> genes = ('SURF1', 'SURF2',)
>>> predicate = VariantPredicates.all(VariantPredicates.gene(g) for g in genes)
>>> predicate.get_question()
'(impacts SURF1 AND impacts SURF2)'
-
+
Args:
predicates: an iterable of predicates to test
"""
@@ -55,11 +55,12 @@ def any(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate:
**Example**
- Build a predicate to test if variant leads to a missense or non-sense change on a fictive transcript `NM_123456.7`:
+ Build a predicate to test if variant leads to a missense
+ or nonsense change on a fictional transcript `NM_123456.7`:
>>> from gpsea.model import VariantEffect
>>> from gpsea.analysis.predicate.genotype import VariantPredicates
-
+
>>> tx_id = 'NM_123456.7'
>>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,)
>>> predicate = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in effects)
@@ -140,7 +141,8 @@ def exon(
Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript.
Args:
- exon: a non-negative `int` with the index of the target exon (e.g. `0` for the 1st exon, `1` for the 2nd, ...)
+ exon: a non-negative `int` with the index of the target exon
+ (e.g. `0` for the 1st exon, `1` for the 2nd, ...)
tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`)
"""
return VariantExonPredicate(exon, tx_id)
@@ -148,10 +150,12 @@ def exon(
@staticmethod
def region(region: Region, tx_id: str) -> VariantPredicate:
"""
- Prepare a :class:`VariantPredicate` that tests if the variant overlaps with a region on a protein of a specific transcript.
+ Prepare a :class:`VariantPredicate` that tests if the variant
+ overlaps with a region on a protein of a specific transcript.
Args:
- region: a :class:`Region` that gives the start and end coordinate of the region of interest on a protein strand.
+ region: a :class:`Region` that gives the start and end coordinate
+ of the region of interest on a protein strand.
"""
return ProteinRegionPredicate(region, tx_id)
@@ -173,15 +177,20 @@ def is_structural_variant(
SVs are usually defined as variant affecting more than a certain number of base pairs.
The thresholds vary in the literature, but here we use 50bp as a default.
- Any variant that affects at least `threshold` base pairs is considered an SV.
- Large SVs with unknown breakpoint coordinates or translocations (:class:`VariantClass.BND`)
+ Any variant that affects at least `threshold` base pairs is considered an SV.
+ Large SVs with unknown breakpoint coordinates or translocations (:class:`VariantClass.BND`)
are always considered as an SV.
Args:
threshold: a non-negative `int` with the number of base pairs that must be affected
"""
- assert threshold >= 0, '`threshold` must be non-negative!'
- return VariantPredicates.change_length('<=', -threshold) | VariantPredicates.change_length('>=', threshold) | VariantPredicates.is_large_imprecise_sv() | IS_BND
+ assert threshold >= 0, "`threshold` must be non-negative!"
+ return (
+ VariantPredicates.change_length("<=", -threshold)
+ | VariantPredicates.change_length(">=", threshold)
+ | VariantPredicates.is_large_imprecise_sv()
+ | IS_BND
+ )
@staticmethod
def structural_type(
@@ -190,7 +199,8 @@ def structural_type(
"""
Prepare a :class:`VariantPredicate` for testing if the variant has a certain structural type.
- We recommend using a descendant of `structural_variant` (`SO:0001537 `_)
+ We recommend using a descendant of `structural_variant`
+ (`SO:0001537 `_)
as the structural type.
**Example**
@@ -237,51 +247,12 @@ def ref_length(
length: int,
) -> VariantPredicate:
"""
- Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele
+ Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele
of variant is above, below, or (not) equal to certain `length`.
- The length of the REF allele corresponds to the length of the genomic region affected by the variant.
- Let's show a few examples.
-
- >>> from gpsea.model import VariantCoordinates
- >>> from gpsea.model.genome import GRCh38
- >>> chr1 = GRCh38.contig_by_name("chr1")
-
- The length of the reference allele of a missense variant is 1
- because the variant affects a 1-bp region spanned by the ``C`` nucleotide:
-
- >>> missense = VariantCoordinates.from_vcf_literal(chr1, 1001, 'C', 'T')
- >>> len(missense)
- 1
-
- The length of a "small" deletion is the same as the length of the ref allele `str`:
- (``'CCC'`` in the example below):
-
- >>> deletion = VariantCoordinates.from_vcf_literal(chr1, 1001, 'CCC', 'C')
- >>> len(deletion)
- 3
-
- This is because the literal notation spells out the alleles.
- However, this simple rule does not apply in symbolic notation.
- Here, the REF length corresponds to the length of the allele region.
-
- For instance, for the following structural variant
-
- .. code::
-
- #CHROM POS ID REF ALT QUAL FILTER INFO
- 1 1001 . C 6 PASS SVTYPE=DEL;END=1003;SVLEN=-3
-
- the length of the REF allele is `3`:
-
- >>> sv_deletion = VariantCoordinates.from_vcf_symbolic(
- ... chr1, pos=1001, end=1003,
- ... ref='C', alt='', svlen=-3,
- ... )
- >>> len(sv_deletion)
- 3
+ .. seealso::
- because the deletion removes 3 base pairs at the coordinates :math:`[1001, 1003]`.
+ See :ref:`length-of-the-reference-allele` for more info.
**Example**
@@ -291,13 +262,12 @@ def ref_length(
>>> predicate = VariantPredicates.ref_length('>', 5)
>>> predicate.get_question()
'ref allele length > 5'
-
+
Args:
operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``.
length: a non-negative `int` with the length threshold.
"""
return RefAlleleLengthPredicate(operator, length)
-
@staticmethod
def change_length(
@@ -310,11 +280,11 @@ def change_length(
.. seealso::
- See :meth:`gpsea.model.VariantCoordinates.change_length` for more info on change length.
+ See :ref:`change-length-of-an-allele` for more info.
**Example**
- Make a predicate for testing if the change length is less than or equal to `-10`,
+ Make a predicate for testing if the change length is less than or equal to `-10`,
e.g. to test if a variant is a *deletion* leading to removal of at least 10 base pairs:
>>> from gpsea.analysis.predicate.genotype import VariantPredicates
@@ -333,15 +303,15 @@ def is_structural_deletion(
threshold: int = -50,
) -> VariantPredicate:
"""
- Prepare a :class:`VariantPredicate` for testing if the variant
+ Prepare a :class:`VariantPredicate` for testing if the variant
is a `chromosomal deletion `_ or a structural variant deletion
that leads to removal of at least *n* base pairs (50bp by default).
-
+
.. note::
The predicate uses :meth:`~gpsea.model.VariantCoordinates.change_length`
to determine if the length of the variant is above or below `threshold`.
-
+
**IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele
is shorter than the reference allele. See the method's documentation for more info.
@@ -378,9 +348,7 @@ def __init__(
self._protein_metadata_service = protein_metadata_service
def protein_feature_type(
- self,
- feature_type: FeatureType,
- tx_id: str
+ self, feature_type: FeatureType, tx_id: str
) -> VariantPredicate:
"""
Prepare a :class:`VariantPredicate` that tests if the variant affects a protein feature type.
@@ -392,11 +360,7 @@ def protein_feature_type(
feature_type, tx_id, self._protein_metadata_service
)
- def protein_feature(
- self,
- feature_id: str,
- tx_id: str
- ) -> VariantPredicate:
+ def protein_feature(self, feature_id: str, tx_id: str) -> VariantPredicate:
"""
Prepare a :class:`VariantPredicate` that tests if the variant affects a protein feature type.
diff --git a/src/gpsea/analysis/predicate/phenotype/__init__.py b/src/gpsea/analysis/predicate/phenotype/__init__.py
index f87a9f92..2efa1e55 100644
--- a/src/gpsea/analysis/predicate/phenotype/__init__.py
+++ b/src/gpsea/analysis/predicate/phenotype/__init__.py
@@ -1,9 +1,19 @@
-from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate, CountingPhenotypeScorer
+"""
+The `gpsea.analysis.predicate.phenotype` package provides the :class:`PhenotypePolyPredicate`
+for assigning :class:`~gpsea.model.Patient` into a phenotype group.
+
+An individual can be assigned based on presence/absence of a disease diagnosis (:class:`DiseasePresencePredicate`)
+or using the phenotype features encoded into HPO terms (:class:`PropagatingPhenotypePredicate`).
+"""
+
+from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate
from ._pheno import DiseasePresencePredicate
from ._pheno import PhenotypeCategorization, P
+from ._util import prepare_predicates_for_terms_of_interest, prepare_hpo_terms_of_interest
__all__ = [
- 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate', 'CountingPhenotypeScorer',
+ 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate',
'DiseasePresencePredicate',
'PhenotypeCategorization', 'P',
+ 'prepare_predicates_for_terms_of_interest', 'prepare_hpo_terms_of_interest',
]
diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
index 4b13f766..c2646f72 100644
--- a/src/gpsea/analysis/predicate/phenotype/_pheno.py
+++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -69,9 +69,9 @@ class PhenotypePolyPredicate(
@property
@abc.abstractmethod
- def phenotypes(self) -> typing.Sequence[P]:
+ def phenotype(self) -> P:
"""
- Get the phenotype entities of interest.
+ Get the phenotype entity of interest.
"""
pass
@@ -97,7 +97,7 @@ def __init__(
self._query = hpotk.util.validate_instance(
query, hpotk.TermId, "phenotypic_feature"
)
- self._query_label = self._hpo.get_term(query)
+ self._query_label = self._hpo.get_term_name(query)
assert self._query_label is not None, f"Query {query} is in HPO"
self._missing_implies_phenotype_excluded = hpotk.util.validate_instance(
missing_implies_phenotype_excluded,
@@ -117,9 +117,8 @@ def get_question(self) -> str:
return f"Is {self._query_label} present in the patient?"
@property
- def phenotypes(self) -> typing.Sequence[hpotk.TermId]:
- # We usually test just a single HPO term, so we return a tuple with a single member.
- return (self._query,)
+ def phenotype(self) -> hpotk.TermId:
+ return self._query
def get_categorizations(
self,
@@ -205,9 +204,8 @@ def get_question(self) -> str:
return f"Was {self._query} diagnosed in the patient"
@property
- def phenotypes(self) -> typing.Sequence[hpotk.TermId]:
- # We usually test just a single Disease, so we return a tuple with a single member.
- return (self._query,)
+ def phenotype(self) -> hpotk.TermId:
+ return self._query
def get_categorizations(
self,
@@ -240,131 +238,3 @@ def test(
def __repr__(self):
return f"DiseasePresencePredicate(query={self._query})"
-
-
-class CountingPhenotypeScorer:
- """
- `CountingPhenotypeScorer` assigns the patient with a phenotype score
- that is equivalent to the count of present phenotypes that are either
- an exact match to the `query` terms or their descendants.
- For instance, we may want to count whether an individual has brain, liver, kidney, and skin abormalities.
- In the case, the query would include the corresponding terms (e.g., Abnormal brain morphology HP:0012443).
- An individual can then have between 0 and 4 phenotype group abnormalities.
- This predicate is intended to be used with the Mann Whitney U test.
-
- Example
- ^^^^^^^
- We first need to load HPO using HPO toolkit:
-
- >>> import hpotk
- >>> store = hpotk.configure_ontology_store()
- >>> hpo = store.load_minimal_hpo(release='v2023-10-09')
-
- Now we can create `CountingPhenotypeScorer` to test for presence of brain, liver, kidney and skin abnormalities:
-
- >>> from gpsea.analysis.predicate.phenotype import CountingPhenotypeScorer
- >>> phenotype_groups = (
- ... "HP:0012443", # Abnormal brain morphology
- ... "HP:0410042", # Abnormal liver morphology
- ... "HP:0012210", # Abnormal renal morphology
- ... "HP:0011121", # Abnormal skin morphology
- ... )
- >>> scorer = CountingPhenotypeScorer.from_query_curies(
- ... hpo=hpo,
- ... query=phenotype_groups,
- ... )
-
- """
-
- @staticmethod
- def from_query_curies(
- hpo: hpotk.MinimalOntology,
- query: typing.Iterable[typing.Union[str, hpotk.TermId]],
- ):
- """
- Create a scorer to test for the number of phenotype terms that fall into the phenotype groups.
-
- :param hpo: HPO as represented by :class:`~hpotk.ontology.MinimalOntology` of HPO toolkit.
- :param query: an iterable of the top-level terms, either represented as CURIEs (`str`)
- or as term IDs.
- """
- query_term_ids = set()
- for q in query:
- # First check if the query items are Term IDs or curies.
- if isinstance(q, str):
- q = hpotk.TermId.from_curie(q)
- elif isinstance(q, hpotk.TermId):
- pass
- else:
- raise ValueError(
- f"query argument must be iterable of hpotk TermId's or strings but we found {type(q)}"
- )
-
- # Now chack that the term IDs are HPO term IDs.
- if q not in hpo:
- raise ValueError(f"The query {q} was not found in the HPO")
- query_term_ids.add(q)
-
- if len(query_term_ids) == 0:
- raise ValueError("`query` must not be empty")
-
- # the query terms must not include a term and its ancestor
- for q in query_term_ids:
- for anc in hpo.graph.get_ancestors(q):
- if anc in query_term_ids:
- raise ValueError(
- f"Both {q} and its ancestor term {anc} were found in the query, "
- + "but query terms must not include a term and its ancestor"
- )
-
- return CountingPhenotypeScorer(
- hpo=hpo,
- query=query_term_ids,
- )
-
- def __init__(
- self,
- hpo: hpotk.MinimalOntology,
- query: typing.Iterable[hpotk.TermId],
- ):
- self._hpo = hpo
- self._query = set(query)
-
- def get_question(self) -> str:
- return "How many of the query HPO terms (or their descendants) does the individual display"
-
- def score(
- self,
- patient: Patient,
- ) -> float:
- """
- Get the count (number) of terms in the query set
- that have matching terms (exact matches or descendants) in the patient.
- Do not double count if the patient has two terms
- (e.g., two different descendants) of one of the query terms.
- """
- count = 0
- for q in self._query:
- for pf in patient.present_phenotypes():
- hpo_id = pf.identifier
- if hpo_id == q or any(
- anc == q for anc in self._hpo.graph.get_ancestors(hpo_id)
- ):
- count += 1
- # We break the inner loop to continue the outer.
- break
-
- # A sanity check - we cannot produce more counts than there are categories!
- assert 0 <= count <= len(self._query)
-
- return count
-
- def __call__(
- self,
- *args: typing.Any,
- **kwds: typing.Any,
- ) -> float:
- # TODO: move to `PhenotypeScorer` API.
- assert len(args) == 1 and isinstance(args[0], Patient), 'The first argument must be an instance of `Patient`'
- assert len(kwds) == 0, 'We do not take any key-word arguments'
- return self.score(args[0])
diff --git a/src/gpsea/analysis/predicate/phenotype/_util.py b/src/gpsea/analysis/predicate/phenotype/_util.py
new file mode 100644
index 00000000..ac73dd02
--- /dev/null
+++ b/src/gpsea/analysis/predicate/phenotype/_util.py
@@ -0,0 +1,88 @@
+import typing
+
+from collections import Counter
+
+import hpotk
+
+from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate
+
+from gpsea.model import Patient
+
+
+def prepare_predicates_for_terms_of_interest(
+ cohort: typing.Iterable[Patient],
+ hpo: hpotk.graph.GraphAware,
+ missing_implies_excluded: bool = False,
+ min_n_of_patients_with_term: int = 2,
+) -> typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]:
+ """
+ A convenience method for creating a battery of :class:`PropagatingPhenotypePredicate` predicates
+ for testing all phenotypes of interest.
+
+ :param cohort: a cohort of individuals to investigate.
+ :param hpo: an entity with an HPO graph (e.g. :class:`~hpotk.MinimalOntology`).
+ :param missing_implies_excluded: `True` if absence of an annotation should be counted as its explicit exclusion.
+ :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term
+ (either directly or indirectly) for the term to be included in the analysis.
+ """
+ return tuple(
+ PropagatingPhenotypePredicate(
+ hpo=hpo,
+ query=term,
+ missing_implies_phenotype_excluded=missing_implies_excluded,
+ ) for term in prepare_hpo_terms_of_interest(
+ cohort, hpo, min_n_of_patients_with_term,
+ )
+ )
+
+
+def prepare_hpo_terms_of_interest(
+ cohort: typing.Iterable[Patient],
+ hpo: hpotk.graph.GraphAware,
+ min_n_of_patients_with_term: int = 2,
+) -> typing.Sequence[hpotk.TermId]:
+ """
+ Prepare a collection of HPO terms to test.
+
+ This includes the direct HPO patient annotations
+ as well as the ancestors of the present terms and the descendants of the excluded terms.
+
+ :param cohort: a cohort of individuals to investigate.
+ :param hpo: an entity with an HPO graph (e.g. :class:`~hpotk.MinimalOntology`).
+ :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term
+ (either directly or indirectly) for the term to be included in the analysis.
+ """
+ present_count = Counter()
+ excluded_count = Counter()
+
+ for patient in cohort:
+ for pf in patient.phenotypes:
+ if pf.is_present:
+ # A present phenotypic feature must be counted in.
+ present_count[pf.identifier] += 1
+ # and it also implies presence of its ancestors.
+ for anc in hpo.graph.get_ancestors(pf):
+ present_count[anc] += 1
+ else:
+ # An excluded phenotypic feature
+ excluded_count[pf.identifier] += 1
+ for desc in hpo.graph.get_descendants(pf):
+ # implies exclusion of its descendants.
+ excluded_count[desc] += 1
+
+ total_count = Counter()
+ for term_id, count in present_count.items():
+ total_count[term_id] += count
+
+ for term_id, count in excluded_count.items():
+ total_count[term_id] += count
+
+ final_hpo = []
+ for term_id in present_count:
+ # Keep the term if it is mentioned at least *n* times (incl. being excluded)
+ # in the cohort
+ n_all = total_count[term_id]
+ if n_all >= min_n_of_patients_with_term:
+ final_hpo.append(term_id)
+
+ return tuple(final_hpo)
diff --git a/src/gpsea/analysis/pscore/__init__.py b/src/gpsea/analysis/pscore/__init__.py
new file mode 100644
index 00000000..d03f6fd9
--- /dev/null
+++ b/src/gpsea/analysis/pscore/__init__.py
@@ -0,0 +1,7 @@
+from ._api import PhenotypeScorer, PhenotypeScoreAnalysis, PhenotypeScoreAnalysisResult
+from ._impl import CountingPhenotypeScorer
+
+__all__ = [
+ 'PhenotypeScorer', 'PhenotypeScoreAnalysis', 'PhenotypeScoreAnalysisResult',
+ 'CountingPhenotypeScorer',
+]
diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py
new file mode 100644
index 00000000..a2899734
--- /dev/null
+++ b/src/gpsea/analysis/pscore/_api.py
@@ -0,0 +1,162 @@
+import abc
+import typing
+
+import pandas as pd
+
+from gpsea.model import Patient
+from ..predicate.genotype import GenotypePolyPredicate
+from .stats import PhenotypeScoreStatistic
+
+
+class PhenotypeScorer(metaclass=abc.ABCMeta):
+ """
+ `PhenotypeScorer` assigns the patient with a phenotype score.
+ """
+
+ def score(self, patient: Patient) -> float:
+ """
+ Compute the score for the `patient`.
+ """
+ pass
+
+
+class PhenotypeScoreAnalysisResult:
+ """
+ `PhenotypeScoreAnalysisResult` is a container for :class:`PhenotypeScoreAnalysis` results.
+ """
+
+ def __init__(
+ self,
+ genotype_phenotype_scores: pd.DataFrame,
+ pval: float,
+ ):
+ self._genotype_phenotype_scores = genotype_phenotype_scores
+ self._pval = float(pval)
+
+ @property
+ def genotype_phenotype_scores(self) -> pd.DataFrame:
+ """
+ Get the DataFrame with the genotype group and the phenotype score for each patient.
+
+ The DataFrame has the following structure:
+
+ ========== ======== =========
+ patient_id genotype phenotype
+ ========== ======== =========
+ patient_1 0 1
+ patient_2 0 3
+ patient_3 None 2
+ patient_4 1 2
+ ... ... ...
+ ========== ======== =========
+
+ The DataFrame index includes the patient IDs, and then there are 2 columns
+ with the `genotype` group id (:attr:`~gpsea.analysis.predicate.PatientCategory.cat_id`)
+ and the `phenotype` score. A `genotype` value may be missing if the patient
+ cannot be assigned into any genotype category.
+ """
+ return self._genotype_phenotype_scores
+
+ @property
+ def pval(self) -> float:
+ """
+ Get the p value of the test.
+ """
+ return self._pval
+
+ def plot_boxplots(
+ self,
+ gt_predicate: GenotypePolyPredicate,
+ ax,
+ colors=["darksalmon", "honeydew"],
+ ):
+ """
+ Draw box plots with distributions of phenotype scores for genotype groups
+ """
+ # skip the patients with unassigned genotype group
+ not_na_gts = self._genotype_phenotype_scores["genotype"].notna()
+ data = self._genotype_phenotype_scores.loc[not_na_gts]
+ actual = set(data["genotype"].unique())
+ expected = gt_predicate.get_categorizations()
+ x = [
+ data.loc[data["genotype"] == c.category.cat_id, "phenotype"].to_list()
+ for c in gt_predicate.get_categorizations()
+ ]
+
+ gt_cat_names = [
+ c.category.name for c in gt_predicate.get_categorizations()
+ ]
+ bplot = ax.boxplot(
+ x=x,
+ patch_artist=True,
+ tick_labels=gt_cat_names,
+ )
+
+ for patch, color in zip(bplot["boxes"], colors):
+ patch.set_facecolor(color)
+
+
+class PhenotypeScoreAnalysis:
+ """
+ `PhenotypeScoreAnalysis` tests the association between two or more genotype groups
+ and a phenotype score.
+
+ The genotype groups are created by a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+ and the phenotype score is computed with :class:`~gpsea.analysis.pscore.PhenotypeScorer`.
+
+ The association is tested with a :class:`~gpsea.analysis.pscore.PhenotypeScoreStatistic`
+ and the results are reported as a :class:`PhenotypeScoreAnalysisResult`.
+ """
+
+ def __init__(
+ self,
+ score_statistic: PhenotypeScoreStatistic,
+ ):
+ assert isinstance(score_statistic, PhenotypeScoreStatistic)
+ self._statistic = score_statistic
+
+ def compare_genotype_vs_phenotype_score(
+ self,
+ cohort: typing.Iterable[Patient],
+ gt_predicate: GenotypePolyPredicate,
+ pheno_scorer: PhenotypeScorer,
+ ) -> PhenotypeScoreAnalysisResult:
+ """
+ Compute the association between genotype groups and phenotype score.
+
+ :param cohort: the cohort to analyze.
+ :param gt_predicate: a predicate for assigning an individual into a genotype group.
+ :param pheno_scorer: the scorer to compute phenotype score.
+ """
+ assert (
+ gt_predicate.n_categorizations() == 2
+ ), "We only support 2 genotype categories at this point"
+
+ idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id")
+ data = pd.DataFrame(
+ None,
+ index=idx,
+ columns=["genotype", "phenotype"],
+ )
+
+ # Apply the predicates on the patients
+ for patient in cohort:
+ gt_cat = gt_predicate.test(patient)
+ if gt_cat is None:
+ data.loc[patient.patient_id, "genotype"] = None
+ else:
+ data.loc[patient.patient_id, "genotype"] = gt_cat.category.cat_id
+
+ data.loc[patient.patient_id, "phenotype"] = pheno_scorer.score(patient)
+
+ # Sort by PatientCategory.cat_id and unpack.
+ # For now, we only allow to have up to 2 groups.
+ x_key, y_key = sorted(data["genotype"].dropna().unique())
+ x = data.loc[data["genotype"] == x_key, "phenotype"].to_numpy(dtype=float)
+ y = data.loc[data["genotype"] == y_key, "phenotype"].to_numpy(dtype=float)
+ pval = self._statistic.compute_pval(scores=(x, y))
+
+ return PhenotypeScoreAnalysisResult(
+ genotype_phenotype_scores=data,
+ pval=pval,
+ )
diff --git a/src/gpsea/analysis/pscore/_impl.py b/src/gpsea/analysis/pscore/_impl.py
new file mode 100644
index 00000000..b2ce8bd2
--- /dev/null
+++ b/src/gpsea/analysis/pscore/_impl.py
@@ -0,0 +1,135 @@
+import typing
+
+import hpotk
+
+from gpsea.model import Patient
+from ._api import PhenotypeScorer
+
+
+class CountingPhenotypeScorer(PhenotypeScorer):
+ """
+ `CountingPhenotypeScorer` assigns the patient with a phenotype score
+ that is equivalent to the count of present phenotypes that are either
+ an exact match to the `query` terms or their descendants.
+
+ For instance, we may want to count whether an individual has brain, liver, kidney, and skin abormalities.
+ In the case, the query would include the corresponding terms (e.g., Abnormal brain morphology HP:0012443).
+ An individual can then have between 0 and 4 phenotype group abnormalities.
+ This predicate is intended to be used with the Mann Whitney U test.
+
+ Example
+ ^^^^^^^
+ We first need to load HPO using HPO toolkit:
+
+ >>> import hpotk
+ >>> store = hpotk.configure_ontology_store()
+ >>> hpo = store.load_minimal_hpo(release='v2024-07-01')
+
+ Now we can create `CountingPhenotypeScorer` to test for presence of brain, liver, kidney and skin abnormalities:
+
+ >>> from gpsea.analysis.pscore import CountingPhenotypeScorer
+ >>> phenotype_groups = (
+ ... "HP:0012443", # Abnormal brain morphology
+ ... "HP:0410042", # Abnormal liver morphology
+ ... "HP:0012210", # Abnormal renal morphology
+ ... "HP:0011121", # Abnormal skin morphology
+ ... )
+ >>> scorer = CountingPhenotypeScorer.from_query_curies(
+ ... hpo=hpo,
+ ... query=phenotype_groups,
+ ... )
+
+ """
+
+ @staticmethod
+ def from_query_curies(
+ hpo: hpotk.MinimalOntology,
+ query: typing.Iterable[typing.Union[str, hpotk.TermId]],
+ ):
+ """
+ Create a scorer to test for the number of phenotype terms that fall into the phenotype groups.
+
+ :param hpo: HPO as represented by :class:`~hpotk.ontology.MinimalOntology` of HPO toolkit.
+ :param query: an iterable of the top-level terms, either represented as CURIEs (`str`)
+ or as term IDs.
+ """
+ query_term_ids = set()
+ for q in query:
+ # First check if the query items are Term IDs or curies.
+ if isinstance(q, str):
+ q = hpotk.TermId.from_curie(q)
+ elif isinstance(q, hpotk.TermId):
+ pass
+ else:
+ raise ValueError(
+ f"query argument must be iterable of hpotk TermId's or strings but we found {type(q)}"
+ )
+
+ # Now chack that the term IDs are HPO term IDs.
+ if q not in hpo:
+ raise ValueError(f"The query {q} was not found in the HPO")
+ query_term_ids.add(q)
+
+ if len(query_term_ids) == 0:
+ raise ValueError("`query` must not be empty")
+
+ # the query terms must not include a term and its ancestor
+ for q in query_term_ids:
+ for anc in hpo.graph.get_ancestors(q):
+ if anc in query_term_ids:
+ raise ValueError(
+ f"Both {q} and its ancestor term {anc} were found in the query, "
+ + "but query terms must not include a term and its ancestor"
+ )
+
+ return CountingPhenotypeScorer(
+ hpo=hpo,
+ query=query_term_ids,
+ )
+
+ def __init__(
+ self,
+ hpo: hpotk.MinimalOntology,
+ query: typing.Iterable[hpotk.TermId],
+ ):
+ self._hpo = hpo
+ self._query = set(query)
+
+ def get_question(self) -> str:
+ return "How many of the query HPO terms (or their descendants) does the individual display"
+
+ def score(
+ self,
+ patient: Patient,
+ ) -> float:
+ """
+ Get the count (number) of terms in the query set
+ that have matching terms (exact matches or descendants) in the patient.
+ Do not double count if the patient has two terms
+ (e.g., two different descendants) of one of the query terms.
+ """
+ count = 0
+ for q in self._query:
+ for pf in patient.present_phenotypes():
+ hpo_id = pf.identifier
+ if hpo_id == q or any(
+ anc == q for anc in self._hpo.graph.get_ancestors(hpo_id)
+ ):
+ count += 1
+ # We break the inner loop to continue the outer.
+ break
+
+ # A sanity check - we cannot produce more counts than there are categories!
+ assert 0 <= count <= len(self._query)
+
+ return count
+
+ # def __call__(
+ # self,
+ # *args: typing.Any,
+ # **kwds: typing.Any,
+ # ) -> float:
+ # # TODO: move to `PhenotypeScorer` API.
+ # assert len(args) == 1 and isinstance(args[0], Patient), 'The first argument must be an instance of `Patient`'
+ # assert len(kwds) == 0, 'We do not take any key-word arguments'
+ # return self.score(args[0])
diff --git a/src/gpsea/analysis/pscore/stats/__init__.py b/src/gpsea/analysis/pscore/stats/__init__.py
new file mode 100644
index 00000000..e46bfb5e
--- /dev/null
+++ b/src/gpsea/analysis/pscore/stats/__init__.py
@@ -0,0 +1,7 @@
+from ._stats import PhenotypeScoreStatistic
+from ._stats import MannWhitneyStatistic
+
+__all__ = [
+ 'PhenotypeScoreStatistic',
+ 'MannWhitneyStatistic',
+]
diff --git a/src/gpsea/analysis/pscore/stats/_stats.py b/src/gpsea/analysis/pscore/stats/_stats.py
new file mode 100644
index 00000000..01cfa4d9
--- /dev/null
+++ b/src/gpsea/analysis/pscore/stats/_stats.py
@@ -0,0 +1,44 @@
+import abc
+import typing
+
+from scipy.stats import mannwhitneyu
+
+
+class PhenotypeScoreStatistic(metaclass=abc.ABCMeta):
+ """
+ `PhenotypeScoreStatistic` calculates a p value
+ for 2 or more phenotype score groups
+ computed by a :class:`~gpsea.analysis.pscore.PhenotypeScorer`.
+ """
+
+ @abc.abstractmethod
+ def compute_pval(
+ self,
+ scores: typing.Collection[typing.Sequence[float]],
+ ) -> float:
+ pass
+
+
+class MannWhitneyStatistic(PhenotypeScoreStatistic):
+ """
+ `MannWhitneyStatistic` is a wrapper around SciPy's
+ :func:`~scipy.stats.mannwhitneyu` function to apply
+ Mann-Whitney U rank test on 2 phenotype scores.
+
+ See :ref:`phenotype-score-stats` for an example usage.
+ """
+
+ def compute_pval(
+ self,
+ scores: typing.Collection[typing.Sequence[float]],
+ ) -> float:
+ assert len(scores) == 2, 'Mann-Whitney U rank test only supports 2 categories at this time'
+
+ x, y = scores
+ _, pval = mannwhitneyu(
+ x=x,
+ y=y,
+ alternative='two-sided',
+ )
+
+ return pval
diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py
index 270ebc03..3209748e 100644
--- a/src/gpsea/model/_cohort.py
+++ b/src/gpsea/model/_cohort.py
@@ -107,7 +107,7 @@ def __hash__(self) -> int:
return hash((self._labels, self._variants, self._phenotypes, self._diseases))
-class Cohort(typing.Sized):
+class Cohort(typing.Sized, typing.Iterable[Patient]):
@staticmethod
def from_patients(
@@ -141,7 +141,7 @@ def __init__(
members: typing.Iterable[Patient],
excluded_member_count: int,
):
- self._patient_set = frozenset(members)
+ self._members = tuple(set(members))
self._excluded_count = excluded_member_count
@property
@@ -149,25 +149,25 @@ def all_patients(self) -> typing.Collection[Patient]:
"""
Get a collection of all patients in the cohort.
"""
- return self._patient_set
+ return self._members
def all_phenotypes(self) -> typing.Set[Phenotype]:
"""
Get a set of all phenotypes (observed or excluded) in the cohort members.
"""
- return set(itertools.chain(phenotype for patient in self._patient_set for phenotype in patient.phenotypes))
+ return set(itertools.chain(phenotype for patient in self._members for phenotype in patient.phenotypes))
def all_diseases(self) -> typing.Set[Disease]:
"""
Get a set of all diseases (observed or excluded) in the cohort members.
"""
- return set(itertools.chain(disease for patient in self._patient_set for disease in patient.diseases))
+ return set(itertools.chain(disease for patient in self._members for disease in patient.diseases))
def all_variants(self) -> typing.Set[Variant]:
"""
Get a set of all variants observed in the cohort members.
"""
- return set(itertools.chain(variant for patient in self._patient_set for variant in patient.variants))
+ return set(itertools.chain(variant for patient in self._members for variant in patient.variants))
@property
def all_transcript_ids(self) -> typing.Set[str]:
@@ -181,13 +181,13 @@ def total_patient_count(self):
"""
Get the total number of cohort members.
"""
- return len(self._patient_set)
+ return len(self._members)
def get_patient_ids(self) -> typing.Set[str]:
"""
Get a set of the patient IDs.
"""
- return set(pat.patient_id for pat in self._patient_set)
+ return set(pat.patient_id for pat in self._members)
def list_present_phenotypes(
self,
@@ -205,7 +205,7 @@ def list_present_phenotypes(
number of patients with that phenotype)
"""
counter = Counter()
- for patient in self._patient_set:
+ for patient in self._members:
counter.update(p.identifier.value for p in patient.phenotypes if p.is_present)
return counter.most_common(top)
@@ -214,7 +214,7 @@ def list_all_diseases(
top=None,
) -> typing.Sequence[typing.Tuple[hpotk.TermId, int]]:
counter = Counter()
- for patient in self._patient_set:
+ for patient in self._members:
counter.update(d.identifier for d in patient.diseases)
return counter.most_common(top)
@@ -230,7 +230,7 @@ def list_all_variants(
list: A sequence of tuples, formatted (variant key, number of patients with that variant)
"""
counter = Counter()
- for patient in self._patient_set:
+ for patient in self._members:
counter.update(variant.variant_info.variant_key for variant in patient.variants)
return counter.most_common(top)
@@ -246,7 +246,7 @@ def list_all_proteins(
list: A list of tuples, formatted (protein ID string, the count of variants that affect the protein)
"""
counter = Counter()
- for patient in self._patient_set:
+ for patient in self._members:
counter.update(txa.protein_id for variant in patient.variants for txa in variant.tx_annotations)
return counter.most_common(top)
@@ -287,13 +287,16 @@ def get_variant_by_key(self, variant_key) -> Variant:
raise ValueError(f"Variant key {variant_key} not found in cohort.")
def __eq__(self, other):
- return isinstance(other, Cohort) and self._patient_set == other._patient_set
+ return isinstance(other, Cohort) and self._members == other._members
+
+ def __iter__(self) -> typing.Iterator[Patient]:
+ return iter(self._members)
def __len__(self) -> int:
- return len(self._patient_set)
+ return len(self._members)
def __repr__(self):
- return f'Cohort(members={self._patient_set}, excluded_count={self._excluded_count})'
+ return f'Cohort(members={self._members}, excluded_count={self._excluded_count})'
def __str__(self):
return repr(self)
diff --git a/src/gpsea/model/_gt.py b/src/gpsea/model/_gt.py
index 354d4b23..1940adbb 100644
--- a/src/gpsea/model/_gt.py
+++ b/src/gpsea/model/_gt.py
@@ -41,9 +41,11 @@ class Genotypes(typing.Sized, typing.Iterable):
>>> b = SampleLabels('B')
We can use one of the static methods to create an instance. Either a single genotype:
+
>>> gt = Genotypes.single(a, Genotype.HETEROZYGOUS)
- Or genotypes of several samples:
+ or genotypes of several samples:
+
>>> gts = Genotypes.from_mapping({a: Genotype.HETEROZYGOUS, b: Genotype.HOMOZYGOUS_ALTERNATE})
There are 2 genotypes in the container:
diff --git a/src/gpsea/model/_variant.py b/src/gpsea/model/_variant.py
index e73e818e..ebc8ae07 100644
--- a/src/gpsea/model/_variant.py
+++ b/src/gpsea/model/_variant.py
@@ -439,8 +439,7 @@ def change_length(self) -> int:
"""
Get the change of length between the `ref` and `alt` alleles due to the variant presence.
- SNVs lead to change length of zero, deletions and insertions/duplications lead to negative
- and positive change lengths, respectively.
+ See :ref:`change-length-of-an-allele` for more info.
"""
return self._change_length
@@ -869,9 +868,8 @@ def __init__(
@property
def variant_info(self) -> VariantInfo:
"""
- Returns:
- VariantInfo: A representation of the variant data for sequence and symbolic variants,
- as well as for large imprecise SVs.
+ Get the representation of the variant data for sequence and symbolic variants,
+ as well as for large imprecise SVs.
"""
return self._variant_info
diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py
index 91fc7856..67c14eab 100644
--- a/src/gpsea/preprocessing/_config.py
+++ b/src/gpsea/preprocessing/_config.py
@@ -359,7 +359,7 @@ def load_phenopackets(
"""
Map phenopacket JSON file into patient, validate the patient data, and assemble the patients into a cohort.
- :param pp_directory: path to a folder with phenopacket JSON files. An error is raised if the path does not point to
+ :param phenopackets: path to a folder with phenopacket JSON files. An error is raised if the path does not point to
a directory with at least one phenopacket.
:param cohort_creator: cohort creator for turning a sequence of phenopacket
into a :class:`~gpsea.model.Cohort`.
diff --git a/src/gpsea/view/_stats.py b/src/gpsea/view/_stats.py
index a7bdb1af..b232ad15 100644
--- a/src/gpsea/view/_stats.py
+++ b/src/gpsea/view/_stats.py
@@ -1,7 +1,10 @@
import typing
+from collections import Counter
+
from jinja2 import Environment, PackageLoader
-from gpsea.analysis import HpoMtcReport
+
+from gpsea.analysis.pcats import HpoTermAnalysisResult
class MtcStatsViewer:
@@ -16,45 +19,47 @@ def __init__(self):
def process(
self,
- hpo_mtc_report: HpoMtcReport,
+ result: HpoTermAnalysisResult,
) -> str:
"""
- Create an HTML that should be shown with `display(HTML(..))` of the IPython package.
+ Create an HTML to present MTC part of the :class:`HpoTermAnalysisResult`.
+
+ Use the `display(HTML(..))` functions of the IPython package.
Args:
- hpo_mtc_report (HpoMtcReport): summary of heuristic term filtering procedure
+ result (HpoTermAnalysisResult): the result to show
Returns:
str: an HTML string with parameterized template for rendering or writing into a standalone HTML file.
"""
- context = self._prepare_context(hpo_mtc_report)
+ assert isinstance(result, HpoTermAnalysisResult)
+ context = self._prepare_context(result)
return self._cohort_template.render(context)
@staticmethod
def _prepare_context(
- hpo_mtc_report: HpoMtcReport,
+ report: HpoTermAnalysisResult,
) -> typing.Mapping[str, typing.Any]:
- results_map = hpo_mtc_report.skipped_terms_dict
- if not isinstance(results_map, dict):
- raise ValueError(
- "hpo_mtc_report.skipped_terms_dict must be dictionary "
- f"but was {type(hpo_mtc_report.skipped_terms_dict)}"
- )
+ counts = Counter()
+ for result in report.mtc_filter_results:
+ if result.is_filtered_out():
+ counts[result.reason] += 1
n_skipped = 0
reason_to_count = list()
- for reason, count in sorted(results_map.items(), key=lambda item: item[1], reverse=True):
+ for reason, count in sorted(counts.items(), key=lambda item: item[1], reverse=True):
reason_to_count.append({"reason": reason, "count": count})
n_skipped += count
- n_tested = hpo_mtc_report.n_terms_before_filtering - n_skipped
+ n_all = len(report.phenotypes)
+ n_tested = n_all - n_skipped
# The following dictionary is used by the Jinja2 HTML template
return {
- "mtc_name": hpo_mtc_report.mtc_method,
- "hpo_mtc_filter_name": hpo_mtc_report.filter_method,
+ "mtc_name": report.mtc_name,
+ "hpo_mtc_filter_name": report.mtc_filter_name,
"skipped_hpo_count": n_skipped,
"tested_hpo_count": n_tested,
- "total_hpo_count": hpo_mtc_report.n_terms_before_filtering,
+ "total_hpo_count": n_all,
"reason_to_count": reason_to_count,
}
diff --git a/tests/analysis/pcats/__init__.py b/tests/analysis/pcats/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/analysis/pcats/test_disease.py b/tests/analysis/pcats/test_disease.py
new file mode 100644
index 00000000..48acece3
--- /dev/null
+++ b/tests/analysis/pcats/test_disease.py
@@ -0,0 +1,50 @@
+import hpotk
+import pytest
+
+from gpsea.model import Cohort
+
+from gpsea.analysis.pcats import DiseaseAnalysis
+from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
+from gpsea.analysis.predicate.phenotype import DiseasePresencePredicate
+
+
+class TestDiseaseAnalysis:
+
+ @pytest.fixture(scope='class')
+ def count_statistic(self) -> CountStatistic:
+ return ScipyFisherExact()
+
+ @pytest.fixture(scope='class')
+ def suox_disease(self) -> DiseasePresencePredicate:
+ sulfite_oxidase_deficiency = hpotk.TermId.from_curie('OMIM:272300')
+ return DiseasePresencePredicate(
+ disease_id_query=sulfite_oxidase_deficiency,
+ )
+
+ @pytest.fixture
+ def analysis(
+ self,
+ count_statistic: CountStatistic,
+ ) -> DiseaseAnalysis:
+ return DiseaseAnalysis(
+ count_statistic=count_statistic,
+ mtc_correction='fdr_bh',
+ mtc_alpha=.05,
+ )
+
+ def test_compare_genotype_vs_phenotypes(
+ self,
+ analysis: DiseaseAnalysis,
+ suox_cohort: Cohort,
+ suox_gt_predicate: GenotypePolyPredicate,
+ suox_disease: DiseasePresencePredicate,
+ ):
+ results = analysis.compare_genotype_vs_phenotypes(
+ cohort=suox_cohort.all_patients,
+ gt_predicate=suox_gt_predicate,
+ pheno_predicates=(suox_disease,),
+ )
+
+ assert results is not None
+ # TODO: improve testing
diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py
new file mode 100644
index 00000000..516bf9fc
--- /dev/null
+++ b/tests/analysis/pcats/test_hpo_term_analysis.py
@@ -0,0 +1,58 @@
+import typing
+
+import hpotk
+import pytest
+
+from gpsea.model import Cohort
+
+from gpsea.analysis.mtc_filter import PhenotypeMtcFilter, HpoMtcFilter
+from gpsea.analysis.pcats import HpoTermAnalysis
+from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
+from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
+
+
+class TestHpoTermAnalysis:
+
+ @pytest.fixture(scope='class')
+ def count_statistic(self) -> CountStatistic:
+ return ScipyFisherExact()
+
+ @pytest.fixture(scope='class')
+ def phenotype_mtc_filter(
+ self,
+ hpo: hpotk.MinimalOntology,
+ ) -> PhenotypeMtcFilter:
+ return HpoMtcFilter.default_filter(
+ hpo=hpo,
+ term_frequency_threshold=.2,
+ )
+
+ @pytest.fixture
+ def analysis(
+ self,
+ count_statistic: CountStatistic,
+ phenotype_mtc_filter: PhenotypeMtcFilter,
+ ) -> HpoTermAnalysis:
+ return HpoTermAnalysis(
+ count_statistic=count_statistic,
+ mtc_filter=phenotype_mtc_filter,
+ mtc_correction='fdr_bh',
+ mtc_alpha=.05,
+ )
+
+ def test_compare_genotype_vs_phenotypes(
+ self,
+ analysis: HpoTermAnalysis,
+ suox_cohort: Cohort,
+ suox_gt_predicate: GenotypePolyPredicate,
+ suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]
+ ):
+ results = analysis.compare_genotype_vs_phenotypes(
+ cohort=suox_cohort.all_patients,
+ gt_predicate=suox_gt_predicate,
+ pheno_predicates=suox_pheno_predicates,
+ )
+
+ assert results is not None
+ # TODO: improve testing
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index eb8a0ca6..3e71283b 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -1,8 +1,8 @@
import pytest
from gpsea.model import *
-from gpsea.analysis.predicate import GenotypePolyPredicate
from gpsea.analysis.predicate.genotype import (
+ GenotypePolyPredicate,
groups_predicate,
VariantPredicates,
)
diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py
index a6888300..5db24621 100644
--- a/tests/analysis/test_analysis.py
+++ b/tests/analysis/test_analysis.py
@@ -14,9 +14,9 @@
from gpsea.model import *
from gpsea.model.genome import *
-from gpsea.analysis.predicate import GenotypePolyPredicate, PatientCategories
+from gpsea.analysis.predicate import PatientCategories
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicate
from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
-from gpsea.analysis.predicate.genotype import VariantPredicate
def test_apply_predicates_on_patients(
diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py
index 1eb4d8e8..0b060516 100644
--- a/tests/analysis/test_mtc_filter.py
+++ b/tests/analysis/test_mtc_filter.py
@@ -5,8 +5,10 @@
import pandas as pd
import pytest
-from gpsea.analysis import HpoMtcFilter, SpecifiedTermsMtcFilter, apply_predicates_on_patients
-from gpsea.analysis.predicate import PatientCategories, GenotypePolyPredicate
+from gpsea.analysis import apply_predicates_on_patients
+from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter
+from gpsea.analysis.predicate import PatientCategories
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
from gpsea.model import Cohort
@@ -154,7 +156,7 @@ def test_filter_terms_to_test(
filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report
assert reason_for_filtering_out['Skipping term because all genotypes have same HPO observed proportions'] == 1
- assert reason_for_filtering_out['Skipping general term'] == 14
+ assert reason_for_filtering_out['Skipping general term'] == 16
assert reason_for_filtering_out['Skipping non-target term'] == 5
assert reason_for_filtering_out['Skipping top level term'] == 0
diff --git a/tests/analysis/predicate/phenotype/test_predicate.py b/tests/analysis/test_pscore.py
similarity index 97%
rename from tests/analysis/predicate/phenotype/test_predicate.py
rename to tests/analysis/test_pscore.py
index 345fa016..9b632902 100644
--- a/tests/analysis/predicate/phenotype/test_predicate.py
+++ b/tests/analysis/test_pscore.py
@@ -4,7 +4,7 @@
import pytest
from gpsea.model import Patient, Phenotype, SampleLabels
-from gpsea.analysis.predicate.phenotype import CountingPhenotypeScorer
+from gpsea.analysis.pscore import CountingPhenotypeScorer
class TestCountingPhenotypeScorer:
diff --git a/tests/conftest.py b/tests/conftest.py
index c3d6387b..54f012ec 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,7 @@
import hpotk
import pytest
-from gpsea.analysis.predicate import GenotypePolyPredicate
-from gpsea.analysis.predicate.genotype import VariantPredicates, VariantPredicate, boolean_predicate
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate
from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate
from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder
from gpsea.model import *
diff --git a/tests/test_tutorial.py b/tests/test_tutorial.py
new file mode 100644
index 00000000..52924eff
--- /dev/null
+++ b/tests/test_tutorial.py
@@ -0,0 +1,90 @@
+import pytest
+import ppktstore
+import hpotk
+
+from gpsea.model import Cohort, VariantEffect
+from gpsea.preprocessing import configure_caching_cohort_creator, CohortCreator, load_phenopackets
+from gpsea.analysis.pcats import HpoTermAnalysis
+from gpsea.analysis.mtc_filter import HpoMtcFilter
+from gpsea.analysis.pcats.stats import ScipyFisherExact
+from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate
+from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
+
+
+class TestTutorial:
+
+ @pytest.fixture(scope='class')
+ def cohort_creator(
+ self,
+ hpo: hpotk.MinimalOntology,
+ ) -> CohortCreator:
+ return configure_caching_cohort_creator(
+ hpo=hpo,
+ )
+
+ @pytest.fixture(scope='class')
+ def cohort(
+ self,
+ cohort_creator: CohortCreator,
+ ) -> Cohort:
+ registry = ppktstore.registry.configure_phenopacket_registry()
+ with registry.open_phenopacket_store('0.1.18') as ps:
+ phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5'))
+
+ cohort, _ = load_phenopackets(
+ phenopackets=phenopackets,
+ cohort_creator=cohort_creator,
+ )
+ return cohort
+
+ @pytest.fixture
+ def mtc_filter(
+ self,
+ hpo: hpotk.MinimalOntology,
+ ) -> HpoMtcFilter:
+ return HpoMtcFilter.default_filter(
+ hpo=hpo,
+ term_frequency_threshold=0.2,
+ )
+
+ @pytest.fixture
+ def hpo_term_analysis(
+ self,
+ mtc_filter,
+ ) -> HpoTermAnalysis:
+ return HpoTermAnalysis(
+ count_statistic=ScipyFisherExact(),
+ mtc_filter=mtc_filter,
+ mtc_correction='fdr_bh',
+ mtc_alpha=0.05,
+ )
+
+ @pytest.mark.skip('Just for manual debugging')
+ def test_compare_genotype_vs_phenotype(
+ self,
+ hpo: hpotk.MinimalOntology,
+ cohort: Cohort,
+ hpo_term_analysis: HpoTermAnalysis,
+ ):
+ tx_id = 'NM_181486.4'
+
+ gt_predicate = groups_predicate(
+ predicates=(
+ VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id),
+ VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id),
+ ),
+ group_names=('Missense', 'Frameshift',),
+ )
+ pheno_predicates = prepare_predicates_for_terms_of_interest(
+ cohort=cohort,
+ hpo=hpo,
+ min_n_of_patients_with_term=2,
+ )
+
+ result = hpo_term_analysis.compare_genotype_vs_phenotypes(
+ cohort=cohort,
+ gt_predicate=gt_predicate,
+ pheno_predicates=pheno_predicates,
+ )
+
+ assert result is not None
diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py
index 0cdacc3a..691f4f6a 100644
--- a/tests/view/test_stats.py
+++ b/tests/view/test_stats.py
@@ -1,11 +1,52 @@
+import hpotk
+import math
import pytest
-from gpsea.analysis import HpoMtcReport
+import pandas as pd
+
+from gpsea.analysis.pcats import HpoTermAnalysisResult
+from gpsea.analysis.predicate import PatientCategories
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
+from gpsea.analysis.mtc_filter import PhenotypeMtcResult
from gpsea.view import MtcStatsViewer
class TestStatsViewable:
+ @pytest.fixture(scope='class')
+ def hpo_term_analysis_result(
+ self,
+ suox_gt_predicate: GenotypePolyPredicate,
+ ) -> HpoTermAnalysisResult:
+ return HpoTermAnalysisResult(
+ phenotypes=(
+ hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly
+ hpotk.TermId.from_curie('HP:0001250'), # Seizure
+ ),
+ n_usable=(40, 20),
+ all_counts=(
+ pd.DataFrame(
+ data=[[10, 5], [10, 15]],
+ index=pd.Index((PatientCategories.YES, PatientCategories.NO,)),
+ columns=pd.Index(suox_gt_predicate.get_categories())
+ ),
+ pd.DataFrame(
+ data=[[5, 0], [5, 10]],
+ index=pd.Index((PatientCategories.YES, PatientCategories.NO,)),
+ columns=pd.Index(suox_gt_predicate.get_categories())
+ ),
+ ),
+ pvals=(math.nan, .005,),
+ corrected_pvals=(math.nan, .01),
+ gt_predicate=suox_gt_predicate,
+ mtc_filter_name='Random MTC filter',
+ mtc_filter_results=(
+ PhenotypeMtcResult.fail("Not too interesting"),
+ PhenotypeMtcResult.ok(),
+ ),
+ mtc_name='fdr_bh',
+ )
+
@pytest.fixture
def stats_viewer(self) -> MtcStatsViewer:
return MtcStatsViewer()
@@ -14,20 +55,8 @@ def stats_viewer(self) -> MtcStatsViewer:
def test_process(
self,
stats_viewer: MtcStatsViewer,
+ hpo_term_analysis_result: HpoTermAnalysisResult
):
- mtc_report = HpoMtcReport(
- filter_name='identity filter',
- mtc_name='bonferroni',
- filter_results_map={
- # The reason for skipping a phenotype -> the number of phenotypes skipped for the reason
- 'I slept bad tonight': 0,
- 'The sun is too bright today': 5,
- 'Life is a conspiracy': 80,
- 'I need coffee': 7,
- },
- n_terms_before_filtering=100, # The filtered out (80 + 7 + 5) + the unfiltered
- )
-
- report = stats_viewer.process(hpo_mtc_report=mtc_report)
+ report = stats_viewer.process(result=hpo_term_analysis_result)
with open('mtc_stats.html', 'w') as fh:
fh.write(report)