diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv
index 421e69272..0bc1e5c6c 100644
--- a/docs/report/tbx5_frameshift_vs_missense.csv
+++ b/docs/report/tbx5_frameshift_vs_missense.csv
@@ -1,262 +1,22 @@
"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,,
,Count,Percent,Count,Percent,Corrected p values,p values
-Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05
-Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214
-Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015
-Heart block [HP:0012722],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015
-Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.018011232501600187,0.005628510156750059
-Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.03598740440845704,0.01349527665317139
-Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.058517997576307476,0.02560162393963452
-Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.14881278039172777,0.07440639019586388
-Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2532484592139712,0.1424522583078588
-Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2699930339748754,0.1687456462342971
-Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6233766233766234,0.42857142857142855
-Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.7619047619047618,0.5714285714285713
-Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.7735491022101784
-Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
+Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05
+Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214
+Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015
+Heart block [HP:0012722],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015
+Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.0191369345329502,0.005628510156750059
+Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.038236617183985605,0.01349527665317139
+Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.062175372424826694,0.02560162393963452
+Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.15811357916621074,0.07440639019586388
+Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2690764879148444,0.1424522583078588
+Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2868675985983051,0.1687456462342971
+Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855
+Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713
+Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784
+Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0
Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
-Abnormal carpal morphology [HP:0001191],30/32,94%,0/0,0%,,
-Abnormal hand morphology [HP:0005922],53/53,100%,20/20,100%,,
-Abnormality of the hand [HP:0001155],60/60,100%,31/31,100%,,
-Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,,
-Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,,
-Phenotypic abnormality [HP:0000118],82/82,100%,38/38,100%,,
-All [HP:0000001],82/82,100%,38/38,100%,,
-Abnormality of the wrist [HP:0003019],30/30,100%,0/0,0%,,
-Abnormality of upper limb joint [HP:0009810],30/30,100%,6/6,100%,,
-Abnormal joint morphology [HP:0001367],31/31,100%,6/6,100%,,
-Abnormal skeletal morphology [HP:0011842],73/73,100%,35/35,100%,,
-Abnormality of the skeletal system [HP:0000924],73/73,100%,35/35,100%,,
-Abnormality of the musculoskeletal system [HP:0033127],74/74,100%,35/35,100%,,
-Short thumb [HP:0009778],11/41,27%,8/30,27%,,
-Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,,
-Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,,
-Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,,
-Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55,100%,22/22,100%,,
-Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55,100%,22/22,100%,,
-Aplasia/hypoplasia of the extremities [HP:0009815],55/55,100%,22/22,100%,,
-Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56,100%,23/23,100%,,
-Abnormal limb bone morphology [HP:0002813],63/63,100%,34/34,100%,,
-Abnormality of limb bone [HP:0040068],63/63,100%,34/34,100%,,
-Abnormal appendicular skeleton morphology [HP:0011844],64/64,100%,34/34,100%,,
-Abnormal finger morphology [HP:0001167],36/36,100%,31/31,100%,,
-Abnormal digit morphology [HP:0011297],38/38,100%,33/33,100%,,
-Abnormal thumb morphology [HP:0001172],30/30,100%,31/31,100%,,
-Short finger [HP:0009381],11/11,100%,8/8,100%,,
-Short digit [HP:0011927],11/11,100%,10/10,100%,,
-Complete atrioventricular canal defect [HP:0001674],5/37,14%,3/36,8%,,
-Atrioventricular canal defect [HP:0006695],5/5,100%,3/3,100%,,
-Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,,
-Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
-Abnormal cardiovascular system morphology [HP:0030680],63/63,100%,30/30,100%,,
-Abnormality of the cardiovascular system [HP:0001626],65/65,100%,32/32,100%,,
-11 pairs of ribs [HP:0000878],1/1,100%,0/0,0%,,
-Missing ribs [HP:0000921],1/1,100%,0/0,0%,,
-Aplasia/Hypoplasia of the ribs [HP:0006712],1/1,100%,0/0,0%,,
-Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],1/1,100%,2/2,100%,,
-Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,2/2,100%,,
-Abnormal axial skeleton morphology [HP:0009121],8/8,100%,5/5,100%,,
-Abnormal thorax morphology [HP:0000765],6/6,100%,5/5,100%,,
-Abnormal rib morphology [HP:0000772],1/1,100%,0/0,0%,,
-Abnormal rib cage morphology [HP:0001547],4/4,100%,0/0,0%,,
-Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,,
-Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,,
-Abnormal cardiac atrium morphology [HP:0005120],43/43,100%,20/20,100%,,
-Finger aplasia [HP:0009380],15/15,100%,14/14,100%,,
-Aplasia involving forearm bones [HP:0009822],7/7,100%,6/6,100%,,
-Aplasia/hypoplasia involving forearm bones [HP:0006503],37/37,100%,12/12,100%,,
-Abnormal forearm bone morphology [HP:0040072],37/37,100%,14/14,100%,,
-Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,,
-Abnormal forearm morphology [HP:0002973],37/37,100%,14/14,100%,,
-Aplasia/Hypoplasia of the radius [HP:0006501],37/37,100%,11/11,100%,,
-Abnormal morphology of the radius [HP:0002818],37/37,100%,13/13,100%,,
-Absent forearm bone [HP:0003953],7/7,100%,6/6,100%,,
-Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,,
-Upper limb undergrowth [HP:0009824],33/33,100%,7/7,100%,,
-Limb undergrowth [HP:0009826],33/33,100%,7/7,100%,,
-Short long bone [HP:0003026],35/35,100%,9/9,100%,,
-Abnormal long bone morphology [HP:0011314],44/44,100%,13/13,100%,,
+Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,,
-Abnormality of thumb phalanx [HP:0009602],13/13,100%,13/13,100%,,
-Limited pronation/supination of forearm [HP:0006394],0/0,0%,3/3,100%,,
-Limited elbow movement [HP:0002996],0/0,0%,4/4,100%,,
-Abnormality of the elbow [HP:0009811],0/0,0%,5/5,100%,,
-Limitation of joint mobility [HP:0001376],0/0,0%,4/4,100%,,
-Abnormality of joint mobility [HP:0011729],1/1,100%,5/5,100%,,
-Abnormal joint physiology [HP:0034430],1/1,100%,5/5,100%,,
-Abnormal musculoskeletal physiology [HP:0011843],1/1,100%,5/5,100%,,
-Abnormality of cardiovascular system electrophysiology [HP:0030956],15/15,100%,3/3,100%,,
-Abnormal cardiovascular system physiology [HP:0011025],23/23,100%,5/5,100%,,
-Pre-capillary pulmonary hypertension [HP:0033578],4/4,100%,0/0,0%,,
-Elevated pulmonary artery pressure [HP:0004890],4/4,100%,0/0,0%,,
-Abnormality of pulmonary circulation [HP:0030875],4/4,100%,0/0,0%,,
-Abnormal vascular physiology [HP:0030163],4/4,100%,0/0,0%,,
-Abnormality of the vasculature [HP:0002597],10/10,100%,2/2,100%,,
-Abnormal respiratory system physiology [HP:0002795],4/4,100%,0/0,0%,,
-Abnormality of the respiratory system [HP:0002086],4/4,100%,0/0,0%,,
-Tricuspid regurgitation [HP:0005180],3/3,100%,0/0,0%,,
-Atrioventricular valve regurgitation [HP:0034376],4/4,100%,2/2,100%,,
-Abnormal atrioventricular valve physiology [HP:0031650],4/4,100%,2/2,100%,,
-Abnormal heart valve physiology [HP:0031653],4/4,100%,2/2,100%,,
-Abnormal tricuspid valve physiology [HP:0031651],3/3,100%,0/0,0%,,
-Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,0/0,0%,,
-Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,0/0,0%,,
-Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,0/0,0%,,
-Abnormal finger phalanx morphology [HP:0005918],3/3,100%,0/0,0%,,
-Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,0/0,0%,,
-Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,0/0,0%,,
-Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,0/0,0%,,
-Aplasia involving bones of the extremities [HP:0009825],0/0,0%,0/0,0%,,
-Aplasia of metacarpal bones [HP:0010048],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,0/0,0%,,
-Abnormal metacarpal morphology [HP:0005916],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,0/0,0%,,
-Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,0/0,0%,,
-Partial absence of thumb [HP:0009659],0/0,0%,0/0,0%,,
-1-2 finger syndactyly [HP:0010704],3/3,100%,1/1,100%,,
-Finger syndactyly [HP:0006101],4/4,100%,2/2,100%,,
-Syndactyly [HP:0001159],4/4,100%,2/2,100%,,
-Upper limb phocomelia [HP:0009813],8/85,9%,2/37,5%,,
-Phocomelia [HP:0009829],8/8,100%,2/2,100%,,
-Clinodactyly of the 5th finger [HP:0004209],1/1,100%,0/0,0%,,
-Finger clinodactyly [HP:0040019],1/1,100%,0/0,0%,,
-Clinodactyly [HP:0030084],1/1,100%,0/0,0%,,
-Deviation of finger [HP:0004097],1/1,100%,2/2,100%,,
-Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,2/2,100%,,
-Deviation of the 5th finger [HP:0009179],1/1,100%,0/0,0%,,
-Abnormal 5th finger morphology [HP:0004207],4/4,100%,0/0,0%,,
-Patent foramen ovale [HP:0001655],4/40,10%,0/36,0%,,
-Small hypothenar eminence [HP:0010487],2/2,100%,0/0,0%,,
-Abnormality of the hypothenar eminence [HP:0010486],2/2,100%,0/0,0%,,
-Abnormality of the musculature of the hand [HP:0001421],2/2,100%,0/0,0%,,
-Abnormality of the musculature of the upper limbs [HP:0001446],2/2,100%,0/0,0%,,
-Abnormality of the musculature of the limbs [HP:0009127],2/2,100%,0/0,0%,,
-Abnormal skeletal muscle morphology [HP:0011805],2/2,100%,0/0,0%,,
-Abnormality of the musculature [HP:0003011],2/2,100%,0/0,0%,,
-Perimembranous ventricular septal defect [HP:0011682],3/59,5%,3/25,12%,,
-Deviation of the thumb [HP:0009603],0/0,0%,2/2,100%,,
-Aplasia/hypoplasia of the humerus [HP:0006507],7/7,100%,4/4,100%,,
-Abnormality of the humerus [HP:0003063],7/7,100%,4/4,100%,,
-Abnormality of the upper arm [HP:0001454],7/7,100%,4/4,100%,,
-Right atrial enlargement [HP:0030718],4/4,100%,0/0,0%,,
-Abnormal right atrium morphology [HP:0025580],4/4,100%,0/0,0%,,
-Atrial septal dilatation [HP:0011995],4/4,100%,0/0,0%,,
-Pectus excavatum [HP:0000767],3/4,75%,2/2,100%,,
-Abnormal sternum morphology [HP:0000766],3/3,100%,2/2,100%,,
-Postaxial hand polydactyly [HP:0001162],3/4,75%,0/0,0%,,
-Postaxial polydactyly [HP:0100259],3/3,100%,0/0,0%,,
-Polydactyly [HP:0010442],3/3,100%,0/0,0%,,
-Hand polydactyly [HP:0001161],3/3,100%,0/0,0%,,
-Duplication of phalanx of hand [HP:0009997],3/3,100%,0/0,0%,,
-Duplication of hand bones [HP:0004275],3/3,100%,0/0,0%,,
-Duplication of bones involving the upper extremities [HP:0009142],3/3,100%,0/0,0%,,
-High palate [HP:0000218],3/3,100%,0/0,0%,,
-Abnormal palate morphology [HP:0000174],5/5,100%,0/0,0%,,
-Abnormal oral cavity morphology [HP:0000163],5/5,100%,1/1,100%,,
-Abnormal oral morphology [HP:0031816],5/5,100%,1/1,100%,,
-Abnormality of the mouth [HP:0000153],5/5,100%,1/1,100%,,
-Abnormality of the face [HP:0000271],5/5,100%,1/1,100%,,
-Abnormality of the head [HP:0000234],5/5,100%,2/2,100%,,
-Abnormality of head or neck [HP:0000152],5/5,100%,2/2,100%,,
-Short neck [HP:0000470],3/3,100%,0/0,0%,,
-Abnormal neck morphology [HP:0025668],3/3,100%,0/0,0%,,
-Abnormality of the neck [HP:0000464],3/3,100%,0/0,0%,,
-Abnormality of the cervical spine [HP:0003319],3/3,100%,0/0,0%,,
-Abnormality of the vertebral column [HP:0000925],4/4,100%,1/1,100%,,
-Shield chest [HP:0000914],3/3,100%,0/0,0%,,
-Enlarged thorax [HP:0100625],3/3,100%,0/0,0%,,
-Y-shaped metatarsals [HP:0010567],3/3,100%,0/0,0%,,
-Abnormal metatarsal morphology [HP:0001832],3/3,100%,0/0,0%,,
-Abnormal lower limb bone morphology [HP:0040069],3/3,100%,0/0,0%,,
-Abnormality of the lower limb [HP:0002814],3/3,100%,0/0,0%,,
-Abnormal foot morphology [HP:0001760],3/3,100%,0/0,0%,,
-Mitral regurgitation [HP:0001653],1/1,100%,2/2,100%,,
-Abnormal mitral valve physiology [HP:0031481],1/1,100%,2/2,100%,,
-First degree atrioventricular block [HP:0011705],0/22,0%,1/1,100%,,
-Congenital malformation of the great arteries [HP:0011603],4/4,100%,2/2,100%,,
-Abnormal morphology of the great vessels [HP:0030962],6/6,100%,2/2,100%,,
-Abnormal blood vessel morphology [HP:0033353],6/6,100%,2/2,100%,,
-Abnormal vascular morphology [HP:0025015],6/6,100%,2/2,100%,,
-Abnormal skull morphology [HP:0000929],1/1,100%,2/2,100%,,
-Sinus bradycardia [HP:0001688],0/0,0%,1/1,100%,,
-Abnormal electrophysiology of sinoatrial node origin [HP:0011702],0/0,0%,1/1,100%,,
-Arrhythmia [HP:0011675],1/1,100%,1/1,100%,,
-Bradycardia [HP:0001662],0/0,0%,1/1,100%,,
-Abnormal skin morphology [HP:0011121],0/0,0%,1/1,100%,,
-Abnormality of the skin [HP:0000951],0/0,0%,1/1,100%,,
-Abnormality of the integument [HP:0001574],0/0,0%,1/1,100%,,
-Sinus venosus atrial septal defect [HP:0011567],0/2,0%,1/1,100%,,
-Abnormal morphology of ulna [HP:0040071],2/2,100%,4/4,100%,,
-Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,2/2,100%,,
-Persistent left superior vena cava [HP:0005301],2/37,5%,0/0,0%,,
-Abnormal superior vena cava morphology [HP:0025575],2/2,100%,0/0,0%,,
-Abnormal vena cava morphology [HP:0005345],2/2,100%,0/0,0%,,
-Abnormal venous morphology [HP:0002624],2/2,100%,0/0,0%,,
-Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,,
-Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,0/0,0%,,
-Shoulder muscle hypoplasia [HP:0008952],0/0,0%,0/0,0%,,
-Hypoplasia of the musculature [HP:0009004],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,0/0,0%,,
-Abnormality of muscle size [HP:0030236],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,0/0,0%,,
-Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,0/0,0%,,
-Common atrium [HP:0011565],0/83,0%,0/38,0%,,
-Unroofed coronary sinus [HP:0031297],0/85,0%,0/38,0%,,
-Atrioventricular dissociation [HP:0011709],0/22,0%,1/1,100%,,
-Bowed forearm bones [HP:0003956],0/0,0%,1/1,100%,,
-Bowing of the arm [HP:0006488],0/0,0%,1/1,100%,,
-Bowing of the long bones [HP:0006487],0/0,0%,1/1,100%,,
-Abnormal diaphysis morphology [HP:0000940],0/0,0%,1/1,100%,,
-Proximal placement of thumb [HP:0009623],0/0,0%,0/0,0%,,
-Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,0/0,0%,,
-Type A brachydactyly [HP:0009370],0/0,0%,0/0,0%,,
-Brachydactyly [HP:0001156],0/0,0%,0/0,0%,,
-Short 5th finger [HP:0009237],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,0/0,0%,,
-Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,0/0,0%,,
-Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,0/0,0%,,
-Short middle phalanx of finger [HP:0005819],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,0/0,0%,,
-Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,0/0,0%,,
-Short phalanx of finger [HP:0009803],0/0,0%,0/0,0%,,
-Micrognathia [HP:0000347],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,,
-Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,,
-Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,,
-Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,,
-Cleft soft palate [HP:0000185],2/2,100%,0/0,0%,,
-Abnormal soft palate morphology [HP:0100736],2/2,100%,0/0,0%,,
-Cleft palate [HP:0000175],2/2,100%,0/0,0%,,
-Orofacial cleft [HP:0000202],2/2,100%,0/0,0%,,
-Craniofacial cleft [HP:5201015],2/2,100%,0/0,0%,,
-Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,,
-Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,,
-Amelia involving the upper limbs [HP:0009812],0/83,0%,1/37,3%,,
-Third degree atrioventricular block [HP:0001709],0/22,0%,1/1,100%,,
-Upper extremity joint dislocation [HP:0030310],0/0,0%,2/2,100%,,
-Joint dislocation [HP:0001373],0/0,0%,2/2,100%,,
-Abnormal toe morphology [HP:0001780],0/0,0%,0/0,0%,,
-Abnormal toe phalanx morphology [HP:0010161],0/0,0%,0/0,0%,,
-Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,0/0,0%,,
-Aplasia/Hypoplasia of the 2nd finger [HP:0006264],1/1,100%,0/0,0%,,
-Abnormal 2nd finger morphology [HP:0004100],1/1,100%,0/0,0%,,
-Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,,
-Left ventricular noncompaction cardiomyopathy [HP:0011664],0/1,0%,1/5,20%,,
-Noncompaction cardiomyopathy [HP:0012817],0/0,0%,1/1,100%,,
-Cardiomyopathy [HP:0001638],0/0,0%,1/1,100%,,
-Abnormal myocardium morphology [HP:0001637],0/0,0%,1/1,100%,,
-Mitral valve prolapse [HP:0001634],0/0,0%,1/1,100%,,
-Abnormal mitral valve morphology [HP:0001633],0/0,0%,1/1,100%,,
-Abnormal atrioventricular valve morphology [HP:0006705],0/0,0%,1/1,100%,,
-Abnormal heart valve morphology [HP:0001654],0/0,0%,1/1,100%,,
-Short 1st metacarpal [HP:0010034],0/30,0%,0/22,0%,,
-Short phalanx of the thumb [HP:0009660],0/30,0%,0/22,0%,,
+Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,,
+Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
diff --git a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
index 5b149af79..b6ac17472 100644
--- a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
+++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html
@@ -48,9 +48,9 @@
Phenotype testing report
Phenotype MTC filter: HPO MTC filter
Multiple testing correction: fdr_bh
- Performed statistical tests for 16 out of the total of 260 HPO terms.
+ Performed statistical tests for 17 out of the total of 260 HPO terms.
- Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
+ Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.
Code |
@@ -61,21 +61,21 @@ Phenotype testing report
TODO |
- Skipping general term |
- 44 |
+ Skipping term with maximum frequency that was less than threshold 0.2 |
+ 51 |
TODO |
- Skipping term because all genotypes have same HPO observed proportions |
- 42 |
+ Skipping general term |
+ 44 |
TODO |
- Skipping term with only 0 observations (not powered for 2x2) |
+ Skipping term because all genotypes have same HPO observed proportions |
41 |
@@ -114,13 +114,6 @@ Phenotype testing report
12 |
-
-
- TODO |
- Skipping term with maximum frequency that was less than threshold 0.2 |
- 10 |
-
-
TODO |
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 10ce6a23a..0a583b39e 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -171,7 +171,7 @@ in the individuals of the *TBX5* cohort.
... ),
... group_names=('Missense', 'Frameshift'),
... )
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
'Genotype group: Missense, Frameshift'
.. note::
@@ -224,8 +224,8 @@ with a false discovery control level at (``mtc_alpha=0.05``):
Choosing the statistical procedure for assessment of association between genotype and phenotype
groups is the last missing piece of the analysis. We will use Fisher Exact Test:
->>> from gpsea.analysis.pcats.stats import ScipyFisherExact
->>> count_statistic = ScipyFisherExact()
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
+>>> count_statistic = FisherExactTest()
and we finalize the analysis setup by putting all components together
into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
@@ -246,15 +246,15 @@ Now we can perform the analysis and investigate the results.
... pheno_predicates=pheno_predicates,
... )
>>> result.total_tests
-16
+17
-We only tested 16 HPO terms. This is despite the individuals being collectively annotated with
+We only tested 1y HPO terms. This is despite the individuals being collectively annotated with
260 direct and indirect HPO terms
>>> len(result.phenotypes)
260
-We can show the reasoning behind *not* testing 244 (`260 - 16`) HPO terms
+We can show the reasoning behind *not* testing 243 (`260 - 17`) HPO terms
by exploring the phenotype MTC filtering report.
>>> from gpsea.view import MtcStatsViewer
@@ -266,11 +266,11 @@ by exploring the phenotype MTC filtering report.
.. raw:: html
:file: report/tbx5_frameshift_vs_missense.mtc_report.html
-and these are the HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:
+and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:
>>> from gpsea.analysis.predicate import PatientCategories
>>> summary_df = result.summarize(hpo, PatientCategories.YES)
->>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP
+>>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP
.. csv-table:: *TBX5* frameshift vs missense
:file: report/tbx5_frameshift_vs_missense.csv
@@ -283,4 +283,4 @@ was observed in 31/60 (52%) patients with a missense variant
but it was observed in 19/19 (100%) patients with a frameshift variant.
Fisher exact test computed a p value of `~0.0000562`
and the p value corrected by Benjamini-Hochberg procedure
-is `~0.00112`.
+is `~0.000955`.
diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst
index f8958b8ae..070539d19 100644
--- a/docs/user-guide/mtc.rst
+++ b/docs/user-guide/mtc.rst
@@ -96,9 +96,9 @@ when creating an instance of :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
>>> from gpsea.analysis.mtc_filter import UseAllTermsMtcFilter
>>> from gpsea.analysis.pcats import HpoTermAnalysis
->>> from gpsea.analysis.pcats.stats import ScipyFisherExact
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
>>> analysis = HpoTermAnalysis(
-... count_statistic=ScipyFisherExact(),
+... count_statistic=FisherExactTest(),
... mtc_filter=UseAllTermsMtcFilter(),
... mtc_correction='bonferroni', # <--- The MTC correction setup
... )
diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst
index 00647f1f2..b224516f5 100644
--- a/docs/user-guide/predicates.rst
+++ b/docs/user-guide/predicates.rst
@@ -4,23 +4,80 @@
Predicates
==========
-GPSEA uses predicates to test if variant, patient, or any tested item
-meets a condition. Based on the test results, the items are assigned into groups.
+Searching for genotype-phenotype associations usually requires to partition
+the individuals into several discrete groups to allow testing for the inter-group differences.
+GPSEA reflects these requirements with its predicate API.
+Perhaps unsurprisingly, a predicate must be capable of partitioning the individuals into two or more groups.
+The groups must be *exclusive* - each individual must be assigned at most into one group.
+Moreover, the groups should be *exhaustive* and cover maximum of the possible states.
+However, the predicate is allowed to return `None` if the individual cannot be assigned.
+In result, the individual will be omitted from the downstream analysis.
+
+Predicates serve both *genotype* and *phenotype* prongs of the analysis.
+Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`)
+assign the :class:`~gpsea.model.Patient`
+into a group (mostly) based on the variant information, while the
+phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`)
+use the HPO terms to assign a group.
+
+It is literally impossible to use GPSEA without the predicates
+because all analyses need at least one predicate (typically a *genotype* predicate).
+Luckily, the purpose of this guide is to show all that is to know about predicates.
+We will first discuss the genotype predicates and end with phenotype predicates.
+
+.. _genotype-predicates:
-As described in the :class:`~gpsea.analysis.predicate.PolyPredicate` API,
-the groups must be *exclusive* - the item can be assigned with one and only one group,
-and *exhaustive* - the groups must cover all possible scenarios.
+*******************
+Genotype predicates
+*******************
+
+A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes.
+Typically, this includes using the genotype data, such as presence of a missense variant
+in a heterozygous genotype. However, other categorical variables,
+such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used.
+
+The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria.
+The testing is done in two steps. First, we count the alleles
+of the matching variants and then we interpret the count, possibly including factors
+such as the expected mode of inheritance and sex, to assign the individual into a group.
+Finding the matching variants is what
+the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about.
+
+
+TODO: wordsmith
+We must first create the variant predicate and then wrap it in genotype predicate.
+
+
+Variant predicates
+==================
+
+GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class
+to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria.
+The variant predicate can leverage multiple primary data:
-However, if the item cannot be assigned into any meaningful category,
-the predicate can return `None`, and the item will be omitted from the analysis.
++------------------------+-------------------------------------------------------------------------------------------------+
+| Primary data source | Example |
++========================+=================================================================================================+
+| Allele | the variant being a deletion or a single nucleotide variant (SNV) |
++------------------------+-------------------------------------------------------------------------------------------------+
+| Genome | overlaps of a target genomic region |
++------------------------+-------------------------------------------------------------------------------------------------+
+| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript |
++------------------------+-------------------------------------------------------------------------------------------------+
+| Protein data | variant is located in a region encoding a protein domain, protein feature type |
++------------------------+-------------------------------------------------------------------------------------------------+
-The predicates can be chained to test for more complex conditions.
-For instance, "test if a patient has a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`".
+which demands a considerable amount of flexibility for creating the predicate.
-Let's demonstrate this on an example with a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`.
-We will load a cohort of 5 subjects with variants in *ANKRD11*, leading to KBG syndrome.
-The the clinical signs and symptoms of the subjects were encoded into HPO terms
-along with the pathogenic *ANKRD11* variant.
+As a rule of thumb, the predicates for testing basic conditions are available off the shelf,
+and they can be used as building block for testing for more complex conditions,
+such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`".
+
+Let's demonstrate this on few examples.
+We will load a cohort of 19 subjects with variants in *RERE*,
+leading to `Holt-Oram syndrome MIM:142900 `_.
+The the clinical signs and symptoms of the subjects were encoded into HPO terms
+along with the pathogenic *RERE* variant.
Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section.
Briefly, we first load HPO:
@@ -36,7 +93,7 @@ then, we configure the cohort creator:
which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets
from the release `0.1.18` of `Phenopacket Store `_.
-This time, however, we will load 19 individuals with mutations in *RERE* gene:
+We will load 19 individuals with mutations in *RERE* gene:
>>> from ppktstore.registry import configure_phenopacket_registry
>>> registry = configure_phenopacket_registry()
@@ -58,8 +115,8 @@ that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with
>>> variant_key_of_interest = '1_8358231_8358231_T_C'
>>> variant = cohort.get_variant_by_key(variant_key_of_interest)
-Simple predicates
-*****************
+Building blocks
+---------------
We can check that the variant overlaps with *RERE*:
@@ -92,10 +149,10 @@ See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates`
for more info on the predicates available off the shelf.
-Compound predicates
-*******************
+Complex conditions
+------------------
-The simple predicates can be combined to test for more elaborate conditions.
+We can combine the building blocks to test for more elaborate conditions.
For instance, we can test if the variant meets *any* or several conditions:
>>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id)
@@ -121,8 +178,8 @@ such as testing if the variant is a *"chromosomal deletion" or a deletion which
'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))'
-Inverted predicate
-******************
+Inverting conditions
+--------------------
Sometimes we may want to test the variant for a condition that must *not* be met.
For instance, we may want to test if the variant is a deletion
@@ -151,9 +208,277 @@ This is how we can use the predicate inversion to build the predicate for non-fr
Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question.
+The variant predicate offers a flexible API for testing if variants meet a condition.
+However, the genotype phenotype correlations are done on the individual level
+and the variant predicates are used as a component of the genotype predicate.
+The next sections show how to use variant predicates to assign individuals into groups.
+
+
+Mode of inheritance predicate
+=============================
+
+The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate`
+assigns the individual into a group based on the number of alleles
+that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`.
+The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports
+the following Mendelian modes of inheritance (MoI):
+
+
++-----------------------+-----------------------------------+------------------+------------------------+
+| Mode of inheritance | Sex | Allele count | Genotype category |
++=======================+===================================+==================+========================+
+| Autosomal dominant | `*` | 0 | `HOM_REF` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | 1 | `HET` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | :math:`\ge 2` | ``None`` |
++-----------------------+-----------------------------------+------------------+------------------------+
+| Autosomal recessive | `*` | 0 | `HOM_REF` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | 1 | `HET` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | 2 | `BIALLELIC_ALT` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | :math:`\ge 3` | ``None`` |
++-----------------------+-----------------------------------+------------------+------------------------+
+| X-linked dominant | `*` | 0 | `HOM_REF` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | 1 | `HET` |
++ +-----------------------------------+------------------+------------------------+
+| | `*` | :math:`\ge 2` | ``None`` |
++-----------------------+-----------------------------------+------------------+------------------------+
+| X-linked recessive | `*` | 0 | `HOM_REF` |
++ +-----------------------------------+------------------+------------------------+
+| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` |
++ + +------------------+------------------------+
+| | | 2 | `BIALLELIC_ALT` |
++ + +------------------+------------------------+
+| | | :math:`\ge 3` | ``None`` |
++ +-----------------------------------+------------------+------------------------+
+| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` |
++ + +------------------+------------------------+
+| | | :math:`\ge 2` | ``None`` |
++-----------------------+-----------------------------------+------------------+------------------------+
+
+.. note::
+
+ `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes.
+
+Clinical judgment should be used to choose the MoI for the cohort analysis.
+Then a predicate for the desired MoI can be created by one of
+:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors:
+
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant`
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant`
+* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive`
+
+All constructors take an instance
+of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument.
+
+
+Example
+-------
+
+Here we show seting up a predicate for grouping individuals based on
+having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5``
+to test differences between the genotypes of a disease with an autosomal recessive MoI.
+
+First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+for testing if a variant meets the condition:
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates
+>>> tx_id = 'NM_1234.5'
+>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \
+... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id)
+>>> is_frameshift_or_stop_gain.get_question()
+'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)'
+
+Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+for assigning a patient into a genotype group:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
+>>> gt_predicate.display_question()
+'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'
+
+The `gt_predicate` can be used in downstream analysis, such as in :class:
+
+
+.. _filtering-predicate:
+
+Filtering predicate
+===================
+
+Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need
+to consider only a subset of the groups. A `GenotypePolyPredicate`
+created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset
+of the target categorizations of interest.
+
+Example
+-------
+
+Let's suppose we want test the genotype-phenotype association between variants
+that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`,
+and we are specifically interested in comparing the heterozygous variants
+in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous).
+
+First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame:
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates
+>>> tx_id = 'NM_1234.5'
+>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \
+... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id)
+>>> is_frameshift_or_stop_gain.get_question()
+'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)'
+
+Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive`
+to bin according to a genotype group:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain)
+>>> gt_predicate.display_question()
+'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT'
+
+We see that the `gt_predicate` bins the patients into three groups:
+
+>>> cats = gt_predicate.get_categorizations()
+>>> cats
+(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT))
+
+We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function,
+and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+that includes only the categories of interest:
+
+>>> from gpsea.analysis.predicate.genotype import filtering_predicate
+>>> fgt_predicate = filtering_predicate(
+... predicate=gt_predicate,
+... targets=(cats[1], cats[2]),
+... )
+>>> fgt_predicate.display_question()
+'Which genotype group does the patient fit in: HET, BIALLELIC_ALT'
+
+
+.. _groups-predicate:
+
+Groups predicate
+================
+
+Sometimes, all we want is to compare if there is a difference between individuals
+who include one or more alleles of variant $X$ vs. individuals with variants $Y$,
+vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates.
+We can do this with a *groups* predicate.
+
+The :func:`~gpsea.analysis.predicate.genotype.groups_predicate`
+takes *n* variant predicates and *n* group labels, and it will assign the patients
+into the respective groups if one or more matching allele is found.
+However, only one predicate is allowed to return a non-zero allele count.
+Otherwise, the patient is assigned with ``None`` and excluded from the analysis.
+
+Example
+-------
+
+Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`
+for testing if the individual has at least one missense vs. frameshift vs. synonymous variant.
+
+>>> from gpsea.model import VariantEffect
+>>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate
+>>> tx_id = 'NM_1234.5'
+>>> gt_predicate = groups_predicate(
+... predicates=(
+... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id),
+... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id),
+... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id),
+... ),
+... group_names=('Missense', 'Frameshift', 'Synonymous'),
+... )
+>>> gt_predicate.display_question()
+'Genotype group: Missense, Frameshift, Synonymous'
+
+
+.. _phenotype-predicates:
+
+********************
+Phenotype predicates
+********************
+
+The phenotype predicate assigns the individual into a group with respect to tested phenotype.
+Typically, the phenotype corresponds to a clinical sign or symptom encoded into an HPO term.
+
+
+Propagating phenotype predicate
+===============================
+
+When testing for presence or absence of an HPO term, the propagating phenotype predicate
+leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy.
+In result, an individual annotated with a term is implicitly annotated with all its ancestors.
+For instance, an individual annotated with `Ectopia lentis `_
+is also annotated with `Abnormal lens morphology `_,
+`Abnormal anterior eye segment morphology `_,
+`Abnormal eye morphology `_, ...
+
+Similarly, all descendants of a term, whose presence was specifically excluded in an individual,
+are implicitly excluded.
+
+:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic.
+
+Example
+-------
+
+Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate`
+to test for a presence of `Abnormal lens morphology `_.
+
+
+>>> from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate
+>>> query = hpotk.TermId.from_curie('HP:0000517')
+>>> pheno_predicate = PropagatingPhenotypePredicate(
+... hpo=hpo,
+... query=query,
+... )
+>>> pheno_predicate.display_question()
+'Is Abnormal lens morphology present in the patient: Yes, No'
+
+
+TODO: explain ``missing_implies_phenotype_excluded``
+
+
+Predicates for all cohort phenotypes
+====================================
+
+Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious.
+The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest`
+function cuts down the tedium:
+
+>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
+>>> pheno_predicates = prepare_predicates_for_terms_of_interest(
+... cohort=cohort,
+... hpo=hpo,
+... )
+>>> len(pheno_predicates)
+301
+
+and prepares predicates for testing 301 HPO terms of the *RERE* cohort.
+
+
+*******
+Gallery
+*******
+
+Here we show examples of predicates used in some of our analyses.
+
+TODO
+
+
+**********
+Need more?
+**********
-That's it for predicates. Please see :class:`~gpsea.analysis.predicate.genotype.VariantPredicates`
+Please see :class:`~gpsea.analysis.predicate.genotype.VariantPredicates`
and :class:`~gpsea.analysis.predicate.genotype.ProteinPredicates`
-for a comprehensive list of the predicates available off the shelf.
+for a list of the predicates available off the shelf.
-Please open an issue on our `GitHub tracker `_ if a predicate seems to be missing.
+However, feel free to open an issue on our `GitHub tracker `_
+if a predicate seems to be missing.
diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv
index 6abe64be2..a96700cfb 100644
--- a/docs/user-guide/report/tbx5_frameshift.csv
+++ b/docs/user-guide/report/tbx5_frameshift.csv
@@ -1,262 +1,22 @@
-FRAMESHIFT_VARIANT on NM_181486.4,Yes,Yes,No,No,,
+"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,,
,Count,Percent,Count,Percent,Corrected p values,p values
-Ventricular septal defect [HP:0001629],19/19,100%,42/71,59%,0.004112753923262261,0.00024192670136836828
-Abnormal atrioventricular conduction [HP:0005150],3/3,100%,1/23,4%,0.013076923076923076,0.0015384615384615385
-Absent thumb [HP:0009777],14/31,45%,18/100,18%,0.021044138590779585,0.0037136715160199273
-Atrioventricular block [HP:0001678],2/2,100%,1/23,4%,0.034,0.010000000000000002
-Heart block [HP:0012722],2/2,100%,1/23,4%,0.034,0.010000000000000002
-Patent ductus arteriosus [HP:0001643],2/2,100%,6/40,15%,0.09214092140921408,0.03252032520325203
-Secundum atrial septal defect [HP:0001684],4/22,18%,23/55,42%,0.1440020479198931,0.06544319142266644
-Triphalangeal thumb [HP:0001199],13/32,41%,23/99,23%,0.1440020479198931,0.06932119159387057
-Cardiac conduction abnormality [HP:0031546],3/3,100%,15/37,41%,0.1440020479198931,0.08259109311740892
-Muscular ventricular septal defect [HP:0011623],6/25,24%,8/84,10%,0.1440020479198931,0.08470708701170182
-Pulmonary arterial hypertension [HP:0002092],0/2,0%,8/14,57%,0.6899307928951144,0.4666666666666667
-Short thumb [HP:0009778],8/30,27%,25/69,36%,0.6899307928951144,0.4870099714553749
-Absent radius [HP:0003974],6/25,24%,9/43,21%,1.0,0.7703831604944444
-Hypoplasia of the radius [HP:0002984],6/14,43%,34/75,45%,1.0,1.0
-Atrial septal defect [HP:0001631],20/20,100%,63/65,97%,1.0,1.0
-Hypoplasia of the ulna [HP:0003022],2/10,20%,3/17,18%,1.0,1.0
-Short humerus [HP:0005792],4/9,44%,8/21,38%,1.0,1.0
-Abnormal ventricular septum morphology [HP:0010438],19/19,100%,42/42,100%,,
-Abnormal cardiac ventricle morphology [HP:0001713],19/19,100%,43/43,100%,,
-Abnormal heart morphology [HP:0001627],30/30,100%,89/89,100%,,
-Abnormal cardiovascular system morphology [HP:0030680],30/30,100%,92/92,100%,,
-Abnormality of the cardiovascular system [HP:0001626],32/32,100%,94/94,100%,,
-Phenotypic abnormality [HP:0000118],38/38,100%,114/114,100%,,
-All [HP:0000001],38/38,100%,114/114,100%,,
-Abnormal cardiac septum morphology [HP:0001671],28/28,100%,89/89,100%,,
-Forearm undergrowth [HP:0009821],7/7,100%,35/35,100%,,
-Abnormal upper limb bone morphology [HP:0040070],14/14,100%,50/50,100%,,
-Abnormality of the upper limb [HP:0002817],34/34,100%,102/102,100%,,
-Abnormality of limbs [HP:0040064],34/34,100%,102/102,100%,,
-Abnormal limb bone morphology [HP:0002813],34/34,100%,92/92,100%,,
-Abnormality of limb bone [HP:0040068],34/34,100%,92/92,100%,,
-Abnormality of the skeletal system [HP:0000924],35/35,100%,103/103,100%,,
-Abnormality of the musculoskeletal system [HP:0033127],35/35,100%,104/104,100%,,
-Abnormal appendicular skeleton morphology [HP:0011844],34/34,100%,93/93,100%,,
-Abnormal skeletal morphology [HP:0011842],35/35,100%,103/103,100%,,
-Upper limb undergrowth [HP:0009824],7/7,100%,38/38,100%,,
-Limb undergrowth [HP:0009826],7/7,100%,38/38,100%,,
-Aplasia/hypoplasia of the extremities [HP:0009815],22/22,100%,78/78,100%,,
-Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23,100%,80/80,100%,,
-Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22,100%,78/78,100%,,
-Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22,100%,78/78,100%,,
-Aplasia/hypoplasia involving forearm bones [HP:0006503],12/12,100%,43/43,100%,,
-Abnormal forearm bone morphology [HP:0040072],14/14,100%,43/43,100%,,
-Abnormal forearm morphology [HP:0002973],14/14,100%,43/43,100%,,
-Short long bone [HP:0003026],9/9,100%,41/41,100%,,
-Abnormal long bone morphology [HP:0011314],13/13,100%,50/50,100%,,
-Aplasia/Hypoplasia of the radius [HP:0006501],11/11,100%,43/43,100%,,
-Abnormal morphology of the radius [HP:0002818],13/13,100%,43/43,100%,,
-1-2 finger syndactyly [HP:0010704],1/1,100%,4/4,100%,,
-Finger syndactyly [HP:0006101],2/2,100%,5/5,100%,,
-Syndactyly [HP:0001159],2/2,100%,5/5,100%,,
-Abnormal digit morphology [HP:0011297],33/33,100%,67/67,100%,,
-Abnormal atrial septum morphology [HP:0011994],20/20,100%,64/64,100%,,
-Abnormal cardiac atrium morphology [HP:0005120],20/20,100%,64/64,100%,,
-Perimembranous ventricular septal defect [HP:0011682],3/25,12%,6/84,7%,,
-Aplasia/Hypoplasia of the thumb [HP:0009601],19/19,100%,40/40,100%,,
-Aplasia/Hypoplasia of fingers [HP:0006265],19/19,100%,44/44,100%,,
-Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19,100%,44/44,100%,,
-Abnormal hand morphology [HP:0005922],20/20,100%,75/75,100%,,
-Abnormality of the hand [HP:0001155],31/31,100%,88/88,100%,,
-Abnormal finger morphology [HP:0001167],31/31,100%,64/64,100%,,
-Abnormal thumb morphology [HP:0001172],31/31,100%,58/58,100%,,
-Finger aplasia [HP:0009380],14/14,100%,23/23,100%,,
-Aplasia involving forearm bones [HP:0009822],6/6,100%,9/9,100%,,
-Absent forearm bone [HP:0003953],6/6,100%,9/9,100%,,
-Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,4/4,100%,,
-Abnormal morphology of ulna [HP:0040071],4/4,100%,4/4,100%,,
-Abnormal carpal morphology [HP:0001191],0/0,0%,30/32,94%,,
-Abnormality of the wrist [HP:0003019],0/0,0%,30/30,100%,,
-Abnormality of upper limb joint [HP:0009810],6/6,100%,32/32,100%,,
-Abnormal joint morphology [HP:0001367],6/6,100%,33/33,100%,,
-Abnormality of cardiovascular system electrophysiology [HP:0030956],3/3,100%,18/18,100%,,
-Abnormal cardiovascular system physiology [HP:0011025],5/5,100%,30/30,100%,,
-Abnormality of thumb phalanx [HP:0009602],13/13,100%,26/26,100%,,
-Upper limb phocomelia [HP:0009813],2/37,5%,8/116,7%,,
-Phocomelia [HP:0009829],2/2,100%,8/8,100%,,
-Short finger [HP:0009381],8/8,100%,27/27,100%,,
-Short digit [HP:0011927],10/10,100%,28/28,100%,,
-Pre-capillary pulmonary hypertension [HP:0033578],0/0,0%,8/8,100%,,
-Elevated pulmonary artery pressure [HP:0004890],0/0,0%,8/8,100%,,
-Abnormality of pulmonary circulation [HP:0030875],0/0,0%,8/8,100%,,
-Abnormal vascular physiology [HP:0030163],0/0,0%,8/8,100%,,
-Abnormality of the vasculature [HP:0002597],2/2,100%,17/17,100%,,
-Abnormal respiratory system physiology [HP:0002795],0/0,0%,8/8,100%,,
-Abnormality of the respiratory system [HP:0002086],0/0,0%,8/8,100%,,
-Congenital malformation of the great arteries [HP:0011603],2/2,100%,7/7,100%,,
-Abnormal morphology of the great vessels [HP:0030962],2/2,100%,10/10,100%,,
-Abnormal blood vessel morphology [HP:0033353],2/2,100%,11/11,100%,,
-Abnormal vascular morphology [HP:0025015],2/2,100%,11/11,100%,,
-Patent foramen ovale [HP:0001655],0/36,0%,4/69,6%,,
-Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,,
-Abnormality of joint mobility [HP:0011729],5/5,100%,3/3,100%,,
-Abnormal joint physiology [HP:0034430],5/5,100%,3/3,100%,,
-Abnormal musculoskeletal physiology [HP:0011843],5/5,100%,3/3,100%,,
-Abnormality of the vertebral column [HP:0000925],1/1,100%,4/4,100%,,
-Abnormal axial skeleton morphology [HP:0009121],5/5,100%,9/9,100%,,
-Complete atrioventricular canal defect [HP:0001674],3/36,8%,6/67,9%,,
-Atrioventricular canal defect [HP:0006695],3/3,100%,6/6,100%,,
-Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,6/6,100%,,
-Shoulder muscle hypoplasia [HP:0008952],0/0,0%,6/6,100%,,
-Hypoplasia of the musculature [HP:0009004],0/0,0%,6/6,100%,,
-Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,6/6,100%,,
-Abnormality of muscle size [HP:0030236],0/0,0%,6/6,100%,,
-Abnormal skeletal muscle morphology [HP:0011805],0/0,0%,8/8,100%,,
-Abnormality of the musculature [HP:0003011],0/0,0%,8/8,100%,,
-Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,6/6,100%,,
-Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,6/6,100%,,
-Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,6/6,100%,,
-Abnormality of the musculature of the limbs [HP:0009127],0/0,0%,8/8,100%,,
-Abnormality of the musculature of the upper limbs [HP:0001446],0/0,0%,8/8,100%,,
-Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,6/6,100%,,
-Bowed forearm bones [HP:0003956],1/1,100%,0/0,0%,,
-Bowing of the arm [HP:0006488],1/1,100%,0/0,0%,,
-Bowing of the long bones [HP:0006487],1/1,100%,0/0,0%,,
-Abnormal diaphysis morphology [HP:0000940],1/1,100%,0/0,0%,,
-Micrognathia [HP:0000347],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,,
-Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,3/3,100%,,
-Abnormal skull morphology [HP:0000929],2/2,100%,1/1,100%,,
-Abnormality of the head [HP:0000234],2/2,100%,5/5,100%,,
-Abnormality of head or neck [HP:0000152],2/2,100%,5/5,100%,,
-Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,,
-Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,,
-Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,,
-Abnormal oral cavity morphology [HP:0000163],1/1,100%,5/5,100%,,
-Abnormal oral morphology [HP:0031816],1/1,100%,5/5,100%,,
-Abnormality of the mouth [HP:0000153],1/1,100%,5/5,100%,,
-Abnormality of the face [HP:0000271],1/1,100%,5/5,100%,,
-Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],2/2,100%,2/2,100%,,
-Abnormal thorax morphology [HP:0000765],5/5,100%,7/7,100%,,
-Persistent left superior vena cava [HP:0005301],0/0,0%,4/39,10%,,
-Abnormal superior vena cava morphology [HP:0025575],0/0,0%,4/4,100%,,
-Abnormal vena cava morphology [HP:0005345],0/0,0%,4/4,100%,,
-Abnormal venous morphology [HP:0002624],0/0,0%,4/4,100%,,
-Aplasia/hypoplasia of the humerus [HP:0006507],4/4,100%,8/8,100%,,
-Abnormality of the humerus [HP:0003063],4/4,100%,8/8,100%,,
-Abnormality of the upper arm [HP:0001454],4/4,100%,8/8,100%,,
-Right atrial enlargement [HP:0030718],0/0,0%,4/4,100%,,
-Abnormal right atrium morphology [HP:0025580],0/0,0%,4/4,100%,,
-Atrial septal dilatation [HP:0011995],0/0,0%,4/4,100%,,
-Left ventricular noncompaction cardiomyopathy [HP:0011664],1/5,20%,1/7,14%,,
-Noncompaction cardiomyopathy [HP:0012817],1/1,100%,1/1,100%,,
-Cardiomyopathy [HP:0001638],1/1,100%,1/1,100%,,
-Abnormal myocardium morphology [HP:0001637],1/1,100%,1/1,100%,,
-Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,,
-Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,,
-Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,,
-Abnormal toe morphology [HP:0001780],0/0,0%,1/1,100%,,
-Abnormal foot morphology [HP:0001760],0/0,0%,4/4,100%,,
-Abnormality of the lower limb [HP:0002814],0/0,0%,4/4,100%,,
-Abnormal toe phalanx morphology [HP:0010161],0/0,0%,1/1,100%,,
-Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,1/1,100%,,
-Abnormal lower limb bone morphology [HP:0040069],0/0,0%,4/4,100%,,
-Aplasia involving bones of the extremities [HP:0009825],0/0,0%,3/3,100%,,
-Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,3/3,100%,,
-Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,3/3,100%,,
-Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,3/3,100%,,
-Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,3/3,100%,,
-Abnormal finger phalanx morphology [HP:0005918],0/0,0%,9/9,100%,,
-Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,6/6,100%,,
-Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,3/3,100%,,
-Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,3/3,100%,,
-Aplasia of metacarpal bones [HP:0010048],0/0,0%,3/3,100%,,
-Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,4/4,100%,,
-Abnormal metacarpal morphology [HP:0005916],0/0,0%,4/4,100%,,
-Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,4/4,100%,,
-Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,4/4,100%,,
-Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,4/4,100%,,
-Partial absence of thumb [HP:0009659],0/0,0%,3/3,100%,,
-Limited pronation/supination of forearm [HP:0006394],3/3,100%,2/2,100%,,
-Limited elbow movement [HP:0002996],4/4,100%,2/2,100%,,
-Abnormality of the elbow [HP:0009811],5/5,100%,2/2,100%,,
-Limitation of joint mobility [HP:0001376],4/4,100%,2/2,100%,,
-Mitral regurgitation [HP:0001653],2/2,100%,2/2,100%,,
-Atrioventricular valve regurgitation [HP:0034376],2/2,100%,7/7,100%,,
-Abnormal atrioventricular valve physiology [HP:0031650],2/2,100%,7/7,100%,,
-Abnormal heart valve physiology [HP:0031653],2/2,100%,7/7,100%,,
-Abnormal mitral valve physiology [HP:0031481],2/2,100%,2/2,100%,,
-Tricuspid regurgitation [HP:0005180],0/0,0%,5/5,100%,,
-Abnormal tricuspid valve physiology [HP:0031651],0/0,0%,5/5,100%,,
-Cleft soft palate [HP:0000185],0/0,0%,2/2,100%,,
-Abnormal soft palate morphology [HP:0100736],0/0,0%,2/2,100%,,
-Abnormal palate morphology [HP:0000174],0/0,0%,5/5,100%,,
-Cleft palate [HP:0000175],0/0,0%,2/2,100%,,
-Orofacial cleft [HP:0000202],0/0,0%,2/2,100%,,
-Craniofacial cleft [HP:5201015],0/0,0%,2/2,100%,,
-Sinus bradycardia [HP:0001688],1/1,100%,2/2,100%,,
-Abnormal electrophysiology of sinoatrial node origin [HP:0011702],1/1,100%,2/2,100%,,
-Arrhythmia [HP:0011675],1/1,100%,3/3,100%,,
-Bradycardia [HP:0001662],1/1,100%,2/2,100%,,
-Pectus excavatum [HP:0000767],2/2,100%,3/4,75%,,
-Abnormal sternum morphology [HP:0000766],2/2,100%,3/3,100%,,
-Deviation of the thumb [HP:0009603],2/2,100%,3/3,100%,,
-Deviation of finger [HP:0004097],2/2,100%,4/4,100%,,
-Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,5/5,100%,,
-Abnormal skin morphology [HP:0011121],1/1,100%,0/0,0%,,
-Abnormality of the skin [HP:0000951],1/1,100%,0/0,0%,,
-Abnormality of the integument [HP:0001574],1/1,100%,0/0,0%,,
-Sinus venosus atrial septal defect [HP:0011567],1/1,100%,0/2,0%,,
-First degree atrioventricular block [HP:0011705],1/1,100%,1/23,4%,,
-Postaxial hand polydactyly [HP:0001162],0/0,0%,3/4,75%,,
-Postaxial polydactyly [HP:0100259],0/0,0%,3/3,100%,,
-Polydactyly [HP:0010442],0/0,0%,3/3,100%,,
-Abnormal 5th finger morphology [HP:0004207],0/0,0%,6/6,100%,,
-Hand polydactyly [HP:0001161],0/0,0%,3/3,100%,,
-Duplication of phalanx of hand [HP:0009997],0/0,0%,3/3,100%,,
-Duplication of hand bones [HP:0004275],0/0,0%,3/3,100%,,
-Duplication of bones involving the upper extremities [HP:0009142],0/0,0%,3/3,100%,,
-High palate [HP:0000218],0/0,0%,3/3,100%,,
-Short neck [HP:0000470],0/0,0%,3/3,100%,,
-Abnormal neck morphology [HP:0025668],0/0,0%,3/3,100%,,
-Abnormality of the neck [HP:0000464],0/0,0%,3/3,100%,,
-Abnormality of the cervical spine [HP:0003319],0/0,0%,3/3,100%,,
-Shield chest [HP:0000914],0/0,0%,3/3,100%,,
-Enlarged thorax [HP:0100625],0/0,0%,3/3,100%,,
-Abnormal rib cage morphology [HP:0001547],0/0,0%,5/5,100%,,
-Y-shaped metatarsals [HP:0010567],0/0,0%,3/3,100%,,
-Abnormal metatarsal morphology [HP:0001832],0/0,0%,3/3,100%,,
-11 pairs of ribs [HP:0000878],0/0,0%,2/2,100%,,
-Missing ribs [HP:0000921],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the ribs [HP:0006712],0/0,0%,2/2,100%,,
-Abnormal rib morphology [HP:0000772],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the 2nd finger [HP:0006264],0/0,0%,2/2,100%,,
-Abnormal 2nd finger morphology [HP:0004100],0/0,0%,2/2,100%,,
-Small hypothenar eminence [HP:0010487],0/0,0%,2/2,100%,,
-Abnormality of the hypothenar eminence [HP:0010486],0/0,0%,2/2,100%,,
-Abnormality of the musculature of the hand [HP:0001421],0/0,0%,2/2,100%,,
-Upper extremity joint dislocation [HP:0030310],2/2,100%,0/0,0%,,
-Joint dislocation [HP:0001373],2/2,100%,0/0,0%,,
-Amelia involving the upper limbs [HP:0009812],1/37,3%,0/114,0%,,
-Third degree atrioventricular block [HP:0001709],1/1,100%,0/22,0%,,
-Atrioventricular dissociation [HP:0011709],1/1,100%,0/22,0%,,
-Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,,
-Proximal placement of thumb [HP:0009623],0/0,0%,3/3,100%,,
-Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,2/2,100%,,
-Type A brachydactyly [HP:0009370],0/0,0%,2/2,100%,,
-Brachydactyly [HP:0001156],0/0,0%,2/2,100%,,
-Short 5th finger [HP:0009237],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,2/2,100%,,
-Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,2/2,100%,,
-Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,2/2,100%,,
-Short middle phalanx of finger [HP:0005819],0/0,0%,2/2,100%,,
-Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,2/2,100%,,
-Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,2/2,100%,,
-Short phalanx of finger [HP:0009803],0/0,0%,2/2,100%,,
-Common atrium [HP:0011565],0/38,0%,1/115,1%,,
-Unroofed coronary sinus [HP:0031297],0/38,0%,1/117,1%,,
-Clinodactyly of the 5th finger [HP:0004209],0/0,0%,2/2,100%,,
-Finger clinodactyly [HP:0040019],0/0,0%,2/2,100%,,
-Clinodactyly [HP:0030084],0/0,0%,2/2,100%,,
-Deviation of the 5th finger [HP:0009179],0/0,0%,2/2,100%,,
-Mitral valve prolapse [HP:0001634],1/1,100%,1/1,100%,,
-Abnormal mitral valve morphology [HP:0001633],1/1,100%,1/1,100%,,
-Abnormal atrioventricular valve morphology [HP:0006705],1/1,100%,1/1,100%,,
-Abnormal heart valve morphology [HP:0001654],1/1,100%,1/1,100%,,
-Short 1st metacarpal [HP:0010034],0/22,0%,1/45,2%,,
-Short phalanx of the thumb [HP:0009660],0/22,0%,1/45,2%,,
+Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825
+Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387
+Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.021044138590779585,0.003713671516019927
+Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.034,0.01
+Heart block [HP:0012722],1/23,4%,2/2,100%,0.034,0.01
+Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.09214092140921408,0.03252032520325203
+Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.1440020479198931,0.06544319142266644
+Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.1440020479198931,0.06932119159387057
+Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.1440020479198931,0.08259109311740892
+Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.1440020479198931,0.08470708701170182
+Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667
+Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483
+Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444
+Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0
+Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0
+Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0
+Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0
+Abnormal atrial septum morphology [HP:0011994],64/64,100%,20/20,100%,,
+Abnormal cardiac septum morphology [HP:0001671],89/89,100%,28/28,100%,,
+Abnormal heart morphology [HP:0001627],89/89,100%,30/30,100%,,
diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst
index 3eeca2bfc..5a8717f3b 100644
--- a/docs/user-guide/stats.rst
+++ b/docs/user-guide/stats.rst
@@ -122,13 +122,34 @@ We want to separate the patients into two groups: a group *with* a frameshift va
and a group *without* a frameshift variant, based on the functional annotation.
We will use the *MANE* transcript for the analysis:
+Building a genotype predicate is a two step process.
+First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`
+to test if the variant leads to a frameshift (in this case):
+
>>> from gpsea.model import VariantEffect
>>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate
>>> tx_id = 'NM_181486.4'
->>> gt_predicate = boolean_predicate(VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id))
->>> gt_predicate.get_question()
+>>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id)
+>>> is_frameshift.get_question()
'FRAMESHIFT_VARIANT on NM_181486.4'
+and then we choose the expected mode of inheritance to test. In case of *TBX5*,
+we expect the autosomal dominant mode of inheritance:
+
+>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate
+>>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift)
+>>> gt_predicate.display_question()
+'Which genotype group does the patient fit in: HOM_REF, HET'
+
+`gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group
+and the patients with one frameshift allele will be assigned into `HET` group.
+Note, any patient with 2 or more alleles will be *omitted* from the analysis.
+
+.. note::
+
+ Mode of inheritance testing is not the only way to dissect by a genotype.
+ See the :ref:`genotype-predicates` section for more info.
+
**Phenotype predicates**
@@ -155,8 +176,8 @@ including the *indirect* annotations whose presence is implied by the true path
We will use :ref: to test the association
between genotype and phenotype groups, as described previously.
->>> from gpsea.analysis.pcats.stats import ScipyFisherExact
->>> count_statistic = ScipyFisherExact()
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
+>>> count_statistic = FisherExactTest()
FET will compute a p value for each genotype phenotype group.
@@ -229,18 +250,32 @@ We can learn more by showing the MTC filter report:
Genotype phenotype associations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Last, let's explore the associations. This is a table of the tested HPO terms
-ordered by the corrected p value (Benjamini-Hochberg FDR):
+Last, let's explore the associations. The results include a table with all tested HPO terms
+ordered by the corrected p value (Benjamini-Hochberg FDR).
+Here we show the top 20 table rows:
>>> from gpsea.analysis.predicate import PatientCategories
>>> summary_df = result.summarize(hpo, PatientCategories.YES)
->>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP
+>>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP
.. csv-table:: *TBX5* frameshift vs rest
:file: report/tbx5_frameshift.csv
:header-rows: 2
+The table shows that several HPO terms are significantly associated
+with presence of a heterozygous (`HET`) frameshift variant in *TBX5*.
+For example, `Ventricular septal defect `_
+was observed in 31/60 (52%) patients with a missense variant
+but it was observed in 19/19 (100%) patients with a frameshift variant.
+Fisher exact test computed a p value of `~0.000242`
+and the p value corrected by Benjamini-Hochberg procedure
+is `~0.00411`.
+
+The table includes all HPO terms of the cohort, including the terms that were not selected for testing
+and thus have no associated p value.
+
+
.. _phenotype-score-stats:
***************
@@ -374,7 +409,7 @@ The genotype predicate will bin the patient into two groups: a point mutation gr
... predicates=(point_mutation, lof_mutation),
... group_names=('Point', 'LoF'),
... )
->>> gt_predicate.get_question()
+>>> gt_predicate.display_question()
'Genotype group: Point, LoF'
diff --git a/src/gpsea/analysis/_api.py b/src/gpsea/analysis/_api.py
index ca7bd8e49..b2b7d41d7 100644
--- a/src/gpsea/analysis/_api.py
+++ b/src/gpsea/analysis/_api.py
@@ -189,7 +189,7 @@ def summarize(
# Column index: multiindex of counts and percentages for all genotype predicate groups
geno_idx = pd.MultiIndex.from_product(
iterables=(self._geno_predicate.get_categories(), ('Count', 'Percent')),
- names=(self._geno_predicate.get_question(), None),
+ names=(self._geno_predicate.get_question_base(), None),
)
# We'll fill this frame with data
diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py
index 3fb67fc77..2c6d68c63 100644
--- a/src/gpsea/analysis/_config.py
+++ b/src/gpsea/analysis/_config.py
@@ -1,13 +1,15 @@
+import enum
import logging
import os
import typing
-import enum
+import warnings
+
import hpotk
from gpsea.config import get_cache_dir_path
from gpsea.model import Cohort
-from gpsea.preprocessing import ProteinMetadataService, UniprotProteinMetadataService, ProteinAnnotationCache, \
- ProtCachingMetadataService
+from gpsea.preprocessing import ProteinMetadataService
+from gpsea.preprocessing import configure_default_protein_metadata_service as backup_pms
from .mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter
from ._api import CohortAnalysis
from ._gp_analysis import FisherExactAnalyzer
@@ -264,8 +266,12 @@ def configure_cohort_analysis(
"""
if config is None:
config = CohortAnalysisConfiguration() # Use the default config
- cache_dir = _configure_cache_dir(cache_dir)
- protein_metadata_service = configure_default_protein_metadata_service(protein_source, cache_dir)
+
+ cache_path = get_cache_dir_path(cache_dir)
+ os.makedirs(cache_path, exist_ok=True)
+
+ cache_dir = str(cache_path)
+ protein_metadata_service = backup_pms(protein_source, cache_dir)
mtc_filter: PhenotypeMtcFilter
if config.mtc_strategy == MtcStrategy.HPO_MTC:
@@ -338,36 +344,9 @@ def configure_default_protein_metadata_service(
in current working directory under `.gpsea_cache/protein_cache`
and reach out to UNIPROT REST API if a cache entry is missing.
"""
- cache_dir = _configure_cache_dir(cache_dir)
- return _configure_protein_service(protein_fallback=protein_source, cache_dir=cache_dir)
-
-
-def _configure_protein_service(
- protein_fallback: str,
- cache_dir: str,
-) -> ProteinMetadataService:
- # (1) ProteinMetadataService
- # Setup fallback
- protein_fallback = _configure_fallback_protein_service(protein_fallback)
- # Setup protein metadata cache
- prot_cache_dir = os.path.join(cache_dir, 'protein_cache')
- os.makedirs(prot_cache_dir, exist_ok=True)
- prot_cache = ProteinAnnotationCache(prot_cache_dir)
- # Assemble the final protein metadata service
- protein_metadata_service = ProtCachingMetadataService(prot_cache, protein_fallback)
- return protein_metadata_service
-
-
-def _configure_fallback_protein_service(protein_fallback: str) -> ProteinMetadataService:
- if protein_fallback == 'UNIPROT':
- fallback1 = UniprotProteinMetadataService()
- else:
- raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}')
- return fallback1
-
-
-def _configure_cache_dir(cache_dir: typing.Optional[str]) -> str:
- cache_path = get_cache_dir_path(cache_dir)
- os.makedirs(cache_path, exist_ok=True)
-
- return str(cache_path)
+ # TODO: remove at some point.
+ warnings.warn(
+ 'Use gpsea.preprocessing.configure_default_protein_metadata_service` instead',
+ DeprecationWarning, stacklevel=2,
+ )
+ return backup_pms(protein_source=protein_source, cache_dir=cache_dir)
diff --git a/src/gpsea/analysis/_gp_analysis.py b/src/gpsea/analysis/_gp_analysis.py
index 35d143053..f786049ce 100644
--- a/src/gpsea/analysis/_gp_analysis.py
+++ b/src/gpsea/analysis/_gp_analysis.py
@@ -63,11 +63,11 @@ def apply_predicates_on_patients(
data=0,
index=pd.Index(
data=ph_predicate.get_categories(),
- name=ph_predicate.get_question(),
+ name=ph_predicate.get_question_base(),
),
columns=pd.Index(
data=gt_predicate.get_categories(),
- name=gt_predicate.get_question(),
+ name=gt_predicate.get_question_base(),
),
)
diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py
index 0aca8cc7d..c52d10b9c 100644
--- a/src/gpsea/analysis/mtc_filter/_impl.py
+++ b/src/gpsea/analysis/mtc_filter/_impl.py
@@ -1,14 +1,15 @@
import abc
import typing
-from collections import defaultdict, deque
+from collections import deque
import hpotk
+from matplotlib import axis
+import numpy as np
import pandas as pd
-from ..predicate import PatientCategories, PatientCategory
from ..predicate.genotype import GenotypePolyPredicate
-from ..predicate.phenotype import P
+from ..predicate.phenotype import PhenotypePolyPredicate, P
class PhenotypeMtcResult:
@@ -76,50 +77,20 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta):
def filter(
self,
gt_predicate: GenotypePolyPredicate,
- phenotypes: typing.Sequence[P],
+ ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
counts: typing.Sequence[pd.DataFrame],
) -> typing.Sequence[PhenotypeMtcResult]:
"""
Test if the phenotype with given counts should be included in the downstream analysis.
:param gt_predicate: the predicate that produced the columns of the `count` data frame.
- :param phenotypes: the tested phenotypes.
+ :param ph_predicates: the phenotype predicates that produced the rows of the `counts` data frames.
:param counts: a sequence of 2D data frames for the tested phenotypes.
Each data frame corrresponds to a genotype/phenotype contingency matrix.
:returns: a sequence of filter results for the input `phenotypes`.
"""
pass
- @abc.abstractmethod
- def filter_terms_to_test(
- self,
- gt_predicate: GenotypePolyPredicate,
- n_usable: typing.Mapping[hpotk.TermId, int],
- all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
- ) -> typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- typing.Mapping[str, int],
- ]:
- """
- Decide which terms to pass through for statistical testing.
- The intention of this class is to reduce multiple testing burden by removing terms that are unlikely
- to lead to interesting statistical/analytical results.
-
- Args:
- gt_predicate: the predicate used to bin patients into groups along the genotype axis
- n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients
- that could be binned according to the used genotype/phenotype predicate.
- all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients
- in the i-th phenotype (rows) and j-th genotype (column) group
- Returns:
- a tuple with three items:
- - a mapping from :class:`hpotk.TermId` ->
- - a mapping from :class:`hpotk.TermId` ->
- - a mapping from a `str` with reason why a term was filtered out (e.g. *Skipping top level term*)
- """
- pass
-
@abc.abstractmethod
def filter_method_name(self) -> str:
"""
@@ -141,27 +112,11 @@ def __init__(self):
def filter(
self,
gt_predicate: GenotypePolyPredicate,
- phenotypes: typing.Sequence[P],
+ ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
counts: typing.Sequence[pd.DataFrame],
) -> typing.Sequence[PhenotypeMtcResult]:
# Always OK! 😏
- return tuple(self._ok for _ in phenotypes)
-
- def filter_terms_to_test(
- self,
- gt_predicate: GenotypePolyPredicate,
- n_usable: typing.Mapping[hpotk.TermId, int],
- all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
- ) -> typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- typing.Mapping[str, int],
- ]:
- """
- Use this implementation to test all available HPO terms.
- No HPO terms will be filtered out!
- """
- return n_usable, all_counts, {}
+ return tuple(self._ok for _ in ph_predicates)
def filter_method_name(self) -> str:
return "All HPO terms"
@@ -194,55 +149,17 @@ def __init__(
def filter(
self,
gt_predicate: GenotypePolyPredicate,
- phenotypes: typing.Sequence[P],
+ ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
counts: typing.Sequence[pd.DataFrame],
) -> typing.Sequence[PhenotypeMtcResult]:
results = []
- for phenotype in phenotypes:
- if phenotype in self._terms_to_test_set:
+ for predicate in ph_predicates:
+ if predicate.phenotype in self._terms_to_test_set:
results.append(self._ok)
else:
results.append(self._fail)
return tuple(results)
- def filter_terms_to_test(
- self,
- gt_predicate: GenotypePolyPredicate,
- n_usable: typing.Mapping[hpotk.TermId, int],
- all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
- ) -> typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- typing.Mapping[str, int],
- ]:
- """
- Remove terms that are not members of the specific set of HPO terms to be tested.
-
- Args:
- gt_predicate: the predicate used to bin patients into groups along the genotype axis
- n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients
- that could be binned according to the used genotype/phenotype predicate.
- all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients
- in the i-th phenotype (rows) and j-th genotype (column) group
-
- Returns:
- filtered versions of the two dictionaries above and dataframe with reasons for skipping
- """
- filtered_n_usable = {}
- filtered_all_counts = {}
- reason_for_filtering_out: typing.DefaultDict[str, int] = defaultdict(int)
-
- for term_id in all_counts.keys():
- if term_id not in self._terms_to_test_set:
- reason_for_filtering_out["Skipping non-specified term"] += 1
- continue
-
- # if we get here, then the term is a member of our list of terms to be tested.
- filtered_n_usable[term_id] = n_usable[term_id]
- filtered_all_counts[term_id] = all_counts[term_id]
-
- return filtered_n_usable, filtered_all_counts, reason_for_filtering_out
-
def filter_method_name(self) -> str:
return "Specified terms MTC filter"
@@ -356,13 +273,13 @@ def __init__(
def filter(
self,
gt_predicate: GenotypePolyPredicate,
- phenotypes: typing.Sequence[hpotk.TermId],
+ ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]],
counts: typing.Sequence[pd.DataFrame],
) -> typing.Sequence[PhenotypeMtcResult]:
+ phenotypes = [p.phenotype for p in ph_predicates]
p_to_idx = {p: i for i, p in enumerate(phenotypes)}
- results = [None for _ in range(len(phenotypes))]
- categories = tuple(gt_predicate.get_categories())
+ results: typing.MutableSequence[typing.Optional[PhenotypeMtcResult]] = [None for _ in range(len(phenotypes))]
for term_id in self._get_ordered_terms(phenotypes):
try:
idx = p_to_idx[term_id]
@@ -387,10 +304,12 @@ def filter(
# reason_for_filtering_out["Skipping non-target term"] += 1
# continue
+ ph_predicate = ph_predicates[idx]
contingency_matrix = counts[idx]
total = contingency_matrix.sum().sum()
max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
- contingency_matrix
+ contingency_matrix,
+ ph_predicate=ph_predicate,
)
if max_freq < self._hpo_term_frequency_filter:
reason = (
@@ -405,23 +324,26 @@ def filter(
results[idx] = PhenotypeMtcResult.fail(reason)
continue
- # todo -- similar for (3,2)
- if not HpoMtcFilter.some_cell_has_greater_than_one_count(contingency_matrix):
+ if not HpoMtcFilter.some_cell_has_greater_than_one_count(
+ counts=contingency_matrix,
+ ph_predicate=ph_predicate,
+ ):
reason = "Skipping term because no genotype has more than one observed HPO count"
results[idx] = PhenotypeMtcResult.fail(reason)
continue
-
+
elif HpoMtcFilter.genotypes_have_same_hpo_proportions(
contingency_matrix,
- categories,
+ gt_predicate=gt_predicate,
+ ph_predicate=ph_predicate,
):
reason = "Skipping term because all genotypes have same HPO observed proportions"
results[idx] = PhenotypeMtcResult.fail(reason)
continue
elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(
- contingency_matrix,
- categories,
+ counts=contingency_matrix,
+ gt_predicate=gt_predicate,
):
reason = "Skipping term because one genotype had zero observations"
results[idx] = PhenotypeMtcResult.fail(reason)
@@ -447,175 +369,70 @@ def filter(
term_name = self._hpo.get_term_name(phenotypes[i])
missing.append(term_name)
- print('BLAAAAAA')
msg = 'Missing results for {}'.format(', '.join(missing))
raise ValueError(msg)
- return tuple(results)
-
- def filter_terms_to_test(
- self,
- gt_predicate: GenotypePolyPredicate,
- n_usable: typing.Mapping[hpotk.TermId, int],
- all_counts: typing.Mapping[hpotk.TermId, pd.DataFrame],
- ) -> typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- typing.Mapping[str, int],
- ]:
- """
- Args:
- gt_predicate: the predicate used to bin patients into groups along the genotype axis
- n_usable: a mapping from the :class:`hpotk.TermId` to an `int` with the count of patients
- that could be binned according to the used genotype/phenotype predicate.
- all_counts: a mapping from the :class:`hpotk.TermId` to counts of patients
- in the i-th phenotype (rows) and j-th genotype (column) group
- """
- filtered_n_usable = {}
- filtered_all_counts = {}
- reason_for_filtering_out = defaultdict(int)
- tested_counts_pf = defaultdict(
- pd.DataFrame
- ) # key is an HP id, value is a tuple with counts, i.e.,
-
- # iterate through the terms in the sorted order, starting from the leaves of the induced graph.
- categories = tuple(gt_predicate.get_categories())
- for term_id in self._get_ordered_terms(all_counts.keys()):
- if term_id in self._general_hpo_terms:
- reason_for_filtering_out["Skipping general term"] += 1
- continue
-
- if not self._hpo.graph.is_ancestor_of(
- hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id
- ):
- reason_for_filtering_out["Skipping non phenotype term"] += 1
- continue
-
- # get total number of observations
- if term_id not in all_counts:
- reason_for_filtering_out["Skipping non-target term"] += 1
- continue
-
- counts_frame = all_counts[term_id]
- total = counts_frame.sum().sum()
- max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
- counts_frame
- )
- if max_freq < self._hpo_term_frequency_filter:
- reason = (
- "Skipping term with maximum frequency "
- f"that was less than threshold {self._hpo_term_frequency_filter}"
- )
- reason_for_filtering_out[reason] += 1
- continue
-
- if counts_frame.shape == (2, 2) and total < 7:
- reason = f"Skipping term with only {total} observations (not powered for 2x2)"
- reason_for_filtering_out[reason] += 1
- continue
-
- # todo -- similar for (3,2)
- if not HpoMtcFilter.some_cell_has_greater_than_one_count(counts_frame):
- reason = "Skipping term because no genotype has more than one observed HPO count"
- reason_for_filtering_out[reason] += 1
- continue
-
- elif HpoMtcFilter.genotypes_have_same_hpo_proportions(
- counts_frame,
- categories,
- ):
- reason = "Skipping term because all genotypes have same HPO observed proportions"
- reason_for_filtering_out[reason] += 1
- continue
-
- elif HpoMtcFilter.one_genotype_has_zero_hpo_observations(
- counts_frame,
- categories,
- ):
- reason = "Skipping term because one genotype had zero observations"
- reason_for_filtering_out[reason] += 1
- continue
-
- for child_term_id in self._hpo.graph.get_children(term_id):
- if child_term_id in tested_counts_pf:
- if tested_counts_pf[child_term_id].equals(counts_frame):
- # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches?
- reason = "Child term with same counts previously tested"
- reason_for_filtering_out[reason] += 1
- continue
-
- # if we get here, then we include the test for `term_id`
- filtered_n_usable[term_id] = n_usable[term_id]
- filtered_all_counts[term_id] = all_counts[term_id]
-
- return filtered_n_usable, filtered_all_counts, reason_for_filtering_out
+ # Ignoring the type hint, because we checked the type match above.
+ return tuple(results) # type: ignore
def filter_method_name(self) -> str:
return "HPO MTC filter"
@staticmethod
- def get_number_of_observed_hpo_observations(counts_frame: pd.DataFrame) -> int:
- return counts_frame.loc[PatientCategories.YES].sum()
+ def get_number_of_observed_hpo_observations(
+ counts_frame: pd.DataFrame,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+ ) -> int:
+ return counts_frame.loc[ph_predicate.present_phenotype_category].sum()
@staticmethod
- def get_maximum_group_observed_HPO_frequency(counts_frame: pd.DataFrame) -> float:
+ def get_maximum_group_observed_HPO_frequency(
+ counts_frame: pd.DataFrame,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+ ) -> float:
"""
Returns:
The maximum frequency of observed HPO annotations across all genotypes.
"""
- df = counts_frame.loc[PatientCategories.YES] / (
- counts_frame.loc[PatientCategories.YES]
- + counts_frame.loc[PatientCategories.NO]
- )
- return df.max()
+ all_hpo_count_per_gt = counts_frame.sum()
+ if (all_hpo_count_per_gt == 0).all():
+ # Prevent division by zeros
+ return 0.
+
+ present_hpo_count_per_gt = counts_frame.loc[ph_predicate.present_phenotype_category]
+ return (present_hpo_count_per_gt / all_hpo_count_per_gt).max()
@staticmethod
def one_genotype_has_zero_hpo_observations(
counts: pd.DataFrame,
- gt_categories: typing.Sequence[PatientCategory],
+ gt_predicate: GenotypePolyPredicate,
):
- if not isinstance(counts, pd.DataFrame):
- raise ValueError(
- f"argument 'counts' must be pandas DataFrame but was {type(counts)}"
- )
-
- if counts.shape == (2, 2):
- assert len(gt_categories) == 2, \
- f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!"
- a, b = gt_categories
- return (
- counts.loc[:, a].sum() == 0 or counts.loc[:, b].sum() == 0
- )
- elif counts.shape == (2, 3):
- raise ValueError("(2,3) not yet implemented")
- else:
- raise ValueError(
- f"Did not recognize shape of counts matrix: {counts.shape}"
- )
+ return any(counts.loc[:, c].sum() == 0 for c in gt_predicate.get_categories())
@staticmethod
- def some_cell_has_greater_than_one_count(counts: pd.DataFrame) -> bool:
+ def some_cell_has_greater_than_one_count(
+ counts: pd.DataFrame,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+ ) -> bool:
"""
If no genotype has more than one HPO count, we do not want to do a test. For instance, if MISSENSE has one
observed HPO and N excluded, and NOT MISSENSE has zero or one observed HPO, then we will skip the test
+
Args:
counts: pandas DataFrame with counts
+ ph_predicate: the phenotype predicate that produced the counts
Returns: true if at least one of the genotypes has more than one observed HPO count
"""
- if not isinstance(counts, pd.DataFrame):
- raise ValueError(
- f"argument 'counts' must be pandas DataFrame but was {type(counts)}"
- )
-
- return (counts.loc[PatientCategories.YES] > 1).any()
+ return (counts.loc[ph_predicate.present_phenotype_category] > 1).any()
@staticmethod
def genotypes_have_same_hpo_proportions(
counts: pd.DataFrame,
- gt_categories: typing.Sequence[PatientCategory],
- delta: float = 0.01,
+ gt_predicate: GenotypePolyPredicate,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+ delta: float = 5e-4,
) -> bool:
"""
If each genotype has the same proportion of observed HPOs, then we do not want to do a test.
@@ -627,28 +444,20 @@ def genotypes_have_same_hpo_proportions(
Returns: true if the genotypes differ by more than `delta`
"""
- if not isinstance(counts, pd.DataFrame):
- raise ValueError(
- f"argument 'counts' must be pandas DataFrame but was {type(counts)}"
- )
+ numerators = np.array([
+ counts.loc[ph_predicate.present_phenotype_category, c] for c in gt_predicate.get_categories()
+ ])
+ denominators = np.array([
+ counts.loc[:, c].sum() for c in gt_predicate.get_categories()
+ ])
+
+ if np.any(denominators == 0):
+ return False
- if counts.shape == (2, 2):
- assert len(gt_categories) == 2, \
- f"The counts frame is 2x2 but we found {len(gt_categories)} patient categories!"
- a, b = gt_categories
- num1 = counts.loc[PatientCategories.YES, a]
- denom1 = counts.loc[:, a].sum()
- num2 = counts.loc[PatientCategories.YES, b]
- denom2 = counts.loc[:, b].sum()
- if denom1 == 0 or denom2 == 0:
- return False
- return abs(num1 / denom1 - num2 / denom2) < delta
- elif counts.shape == (2, 3):
- raise ValueError("(2,3) not implemented yet")
- else:
- raise ValueError(
- f"Did not recognize shape of counts matrix: {counts.shape}"
- )
+ ratios = numerators / denominators
+ mean_ratio = np.mean(ratios)
+ abs_diff = np.abs(ratios - mean_ratio)
+ return bool(np.all(abs_diff < delta))
def _get_ordered_terms(
self,
diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py
index 6a82d92f1..59eceb977 100644
--- a/src/gpsea/analysis/pcats/__init__.py
+++ b/src/gpsea/analysis/pcats/__init__.py
@@ -21,9 +21,11 @@
from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult
from ._impl import DiseaseAnalysis
from ._impl import HpoTermAnalysis, HpoTermAnalysisResult
+from ._impl import apply_predicates_on_patients
__all__ = [
'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult',
'DiseaseAnalysis',
'HpoTermAnalysis', 'HpoTermAnalysisResult',
+ 'apply_predicates_on_patients',
]
diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index e4513fa0b..e8045d2d0 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -1,5 +1,6 @@
import abc
import math
+import os
import typing
from collections import Counter
@@ -63,11 +64,11 @@ def apply_predicates_on_patients(
data=0,
index=pd.Index(
data=ph_predicate.get_categories(),
- name=ph_predicate.get_question(),
+ name=ph_predicate.get_question_base(),
),
columns=pd.Index(
data=gt_predicate.get_categories(),
- name=gt_predicate.get_question(),
+ name=gt_predicate.get_question_base(),
),
)
@@ -197,7 +198,7 @@ def summarize(
gt_idx = pd.MultiIndex.from_product(
# TODO: fix the below
iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")),
- names=(self._gt_predicate.get_question(), None),
+ names=(self._gt_predicate.get_question_base(), None),
)
# We'll fill this frame with data
@@ -273,20 +274,87 @@ def __init__(
(e.g. Bonferroni MTC) or false discovery rate for the FDR procedures (e.g. Benjamini-Hochberg).
"""
assert isinstance(count_statistic, CountStatistic)
+ assert len(count_statistic.supports_shape) == 2, "The statistic must support 2D contingency tables"
self._count_statistic = count_statistic
self._mtc_correction = mtc_correction
assert isinstance(mtc_alpha, float) and 0. <= mtc_alpha <= 1.
self._mtc_alpha = mtc_alpha
- @abc.abstractmethod
def compare_genotype_vs_phenotypes(
self,
cohort: typing.Iterable[Patient],
gt_predicate: GenotypePolyPredicate,
pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+ ) -> MultiPhenotypeAnalysisResult[P]:
+ # Check compatibility between the count statistic and predicate.
+ issues = MultiPhenotypeAnalysis._check_compatibility(
+ count_statistic=self._count_statistic,
+ gt_predicate=gt_predicate,
+ pheno_predicates=pheno_predicates,
+ )
+ if len(issues) != 0:
+ msg = os.linesep.join(issues)
+ raise ValueError(f'Cannot execute the analysis: {msg}')
+
+ return self._compute_result(
+ cohort=cohort,
+ gt_predicate=gt_predicate,
+ pheno_predicates=pheno_predicates,
+ )
+
+ @abc.abstractmethod
+ def _compute_result(
+ self,
+ cohort: typing.Iterable[Patient],
+ gt_predicate: GenotypePolyPredicate,
+ pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
) -> MultiPhenotypeAnalysisResult[P]:
pass
+ @staticmethod
+ def _check_compatibility(
+ count_statistic: CountStatistic,
+ gt_predicate: GenotypePolyPredicate,
+ pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+ ) -> typing.Collection[str]:
+ # There should be 2 items due to check in `__init__`.
+ (pheno, geno) = count_statistic.supports_shape
+
+ issues = []
+ # Check phenotype
+ if isinstance(pheno, int):
+ pheno_accepted = (pheno,)
+ elif isinstance(pheno, typing.Sequence):
+ pheno_accepted = pheno
+ else:
+ issues.append('Cannot use a count statistic that does not check phenotypes')
+
+ pheno_failed = []
+ for i, ph_predicate in enumerate(pheno_predicates):
+ if ph_predicate.n_categorizations() not in pheno_accepted:
+ pheno_failed.append(i)
+ if len(pheno_failed) != 0:
+ issues.append(
+ 'Phenotype predicates {} are incompatible with the count statistic'.format(
+ ', '.join(str(i) for i in pheno_failed)
+ )
+ )
+
+ # Check genotype
+ if isinstance(geno, int):
+ geno_accepted = (geno,)
+ elif isinstance(geno, typing.Sequence):
+ geno_accepted = geno
+ elif pheno is None:
+ raise ValueError('Cannot use a count statistic that does not check genotypes')
+ else:
+ raise ValueError(f'Cannot use a count statistic that supports shape {pheno, geno}')
+
+ if gt_predicate.n_categorizations() not in geno_accepted:
+ issues.append('Genotype predicate is incompatible with the count statistic')
+
+ return issues
+
def _compute_nominal_pvals(
self,
n_usable: typing.Iterable[int],
@@ -380,7 +448,7 @@ def __hash__(self):
class DiseaseAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]):
- def compare_genotype_vs_phenotypes(
+ def _compute_result(
self,
cohort: typing.Iterable[Patient],
gt_predicate: GenotypePolyPredicate,
@@ -513,7 +581,7 @@ def __init__(
assert isinstance(mtc_filter, PhenotypeMtcFilter)
self._mtc_filter = mtc_filter
- def compare_genotype_vs_phenotypes(
+ def _compute_result(
self,
cohort: typing.Iterable[Patient],
gt_predicate: GenotypePolyPredicate,
@@ -534,10 +602,12 @@ def compare_genotype_vs_phenotypes(
# 2 - Apply MTC filter and select p values to MTC
mtc_filter_results = self._mtc_filter.filter(
gt_predicate=gt_predicate,
- phenotypes=phenotypes,
+ ph_predicates=pheno_predicates,
counts=all_counts,
)
mtc_mask = np.array([r.is_passed() for r in mtc_filter_results])
+ if not mtc_mask.any():
+ raise ValueError("No phenotypes are left for the analysis after MTC filtering step")
# 3 - Compute nominal p values
pvals = np.full(shape=(len(n_usable),), fill_value=np.nan)
diff --git a/src/gpsea/analysis/pcats/stats/__init__.py b/src/gpsea/analysis/pcats/stats/__init__.py
index 4d7e05437..a535129d4 100644
--- a/src/gpsea/analysis/pcats/stats/__init__.py
+++ b/src/gpsea/analysis/pcats/stats/__init__.py
@@ -1,5 +1,5 @@
-from ._stats import CountStatistic, ScipyFisherExact, PythonMultiFisherExact
+from ._stats import CountStatistic, FisherExactTest
__all__ = [
- 'CountStatistic', 'ScipyFisherExact', 'PythonMultiFisherExact',
+ 'CountStatistic', 'FisherExactTest',
]
diff --git a/src/gpsea/analysis/pcats/stats/_stats.py b/src/gpsea/analysis/pcats/stats/_stats.py
index 8df07fdcd..981176ee1 100644
--- a/src/gpsea/analysis/pcats/stats/_stats.py
+++ b/src/gpsea/analysis/pcats/stats/_stats.py
@@ -1,5 +1,7 @@
import abc
import math
+import typing
+
from decimal import Decimal
@@ -14,9 +16,45 @@ class CountStatistic(metaclass=abc.ABCMeta):
`CountStatistic` calculates a p value for a contingency table
produced by a pair of discrete random variables.
- The `counts` table is usually `2x2` or `2x3`.
+
+ Supports shape
+ ^^^^^^^^^^^^^^
+
+ `CountStatistic` takes the counts in form of a data frame,
+ and some statistics impose additional requirements on the frame shape.
+ For instance, GPSEA's implementation of the Fisher exact test
+ can compare counts in a ``(2, 2)`` or ``(2, 3)`` arrays
+ but χ2 test can test an ``(m, n)`` array.
+
+ It is important to check that a genotype/phenotype predicate produces
+ the number of groups which the statistic can test.
+
+ The :attr:`supports_shape` returns a sequence with requirements
+ on the shape of the data array/frame. The sequence includes
+ the number of
+
+ Examples
+ ********
+
+ +------------------------+-------------------------+------------------+
+ | Test | Array shape | `supports_shape` |
+ +========================+=========================+==================+
+ | Fisher Exact Test | ``(2, [2, 3])`` | ``(2, [2,3])`` |
+ +------------------------+-------------------------+------------------+
+ | χ2 | ``(*, *)`` | ``(None, None)`` |
+ +------------------------+-------------------------+------------------+
"""
+ @property
+ @abc.abstractmethod
+ def supports_shape(
+ self,
+ ) -> typing.Sequence[typing.Union[int, typing.Sequence[int], None]]:
+ """
+ Get a sequence of the supported shapes.
+ """
+ pass
+
@abc.abstractmethod
def compute_pval(
self,
@@ -25,47 +63,35 @@ def compute_pval(
pass
-class ScipyFisherExact(CountStatistic):
+class FisherExactTest(CountStatistic):
"""
- `ScipyFisherExact` performs Fisher Exact Test on a `2x2` contingency table.
+ `FisherExactTest` performs Fisher Exact Test on a `2x2` or `2x3` contingency table.
- The class is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function.
- The two-sided :math:`H_1` is considered.
+ The `2x2` version is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function,
+ while the `2x3` variant is implemented in Python.
+ In both variants, the two-sided :math:`H_1` is considered.
"""
+
+ def __init__(self):
+ self._shape = (2, (2, 3))
- def compute_pval(
+ @property
+ def supports_shape(
self,
- counts: pd.DataFrame,
- ) -> float:
- assert counts.shape == (
- 2,
- 2,
- ), f"Cannot run Fisher exact test on an array with {counts.shape} shape"
- _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided")
- return pval
-
-
-class PythonMultiFisherExact(CountStatistic):
- """
- `PythonMultiFisherExact` is a Python implementation of Fisher Exact Test to compute
- p value for a `2x3` contingency table.
- """
+ ) -> typing.Sequence[typing.Union[int, typing.Sequence[int], None]]:
+ return self._shape
def compute_pval(
self,
counts: pd.DataFrame,
) -> float:
- PythonMultiFisherExact._check_input(counts)
- return self._fisher_exact(counts.values)
-
- @staticmethod
- def _check_input(a: pd.DataFrame):
- if not isinstance(a, pd.DataFrame):
- raise ValueError(f"Expected a pandas DataFrame but got {type(a)}")
- if not a.shape == (2, 3):
- raise ValueError(f"Shape of the array must be (2, 3) but got {a.shape}")
- if np.array_equal(a.values, np.zeros_like(a)):
- raise ValueError("Data frame is all zeros, cannot run analysis")
+ if counts.shape == (2, 2):
+ _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided")
+ return pval
+ elif counts.shape == (2, 3):
+ return self._fisher_exact(counts.values)
+ else:
+ raise ValueError(f'Unsupported counts shape {counts.shape}')
def _fisher_exact(
self,
diff --git a/src/gpsea/analysis/pcats/stats/_test__stats.py b/src/gpsea/analysis/pcats/stats/_test__stats.py
index aa2543063..c8db0b164 100644
--- a/src/gpsea/analysis/pcats/stats/_test__stats.py
+++ b/src/gpsea/analysis/pcats/stats/_test__stats.py
@@ -2,14 +2,14 @@
import pandas as pd
import pytest
-from ._stats import PythonMultiFisherExact
+from ._stats import FisherExactTest
class TestPythonMultiFisherExact:
@pytest.fixture
- def fisher_exact(self) -> PythonMultiFisherExact:
- return PythonMultiFisherExact()
+ def fisher_exact(self) -> FisherExactTest:
+ return FisherExactTest()
@pytest.mark.parametrize(
"counts, expected",
@@ -24,10 +24,9 @@ def test_compute_pval(
self,
counts,
expected,
- fisher_exact: PythonMultiFisherExact,
+ fisher_exact: FisherExactTest,
):
contingency_matrix = pd.DataFrame(np.array(counts, dtype=np.int64))
final_pval = fisher_exact.compute_pval(contingency_matrix)
assert final_pval == pytest.approx(expected)
-
diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py
index 8c62e1910..534eca18d 100644
--- a/src/gpsea/analysis/predicate/_api.py
+++ b/src/gpsea/analysis/predicate/_api.py
@@ -1,6 +1,8 @@
import abc
import typing
+from collections import Counter
+
import hpotk.util
from gpsea.model import Patient
@@ -136,9 +138,7 @@ class PolyPredicate(typing.Generic[C], metaclass=abc.ABCMeta):
and *exhaustive* - the groups must cover all possible scenarios.
However, if the patient cannot be assigned into any meaningful category, `None` can be returned.
-
- Note, that `PolyPredicate` must be used on a happy path - testing a patient must inherently make sense.
- Predicate will *not* check if, for instance, the patient variants are compatible with a certain mode of inheritance.
+ As a rule of thumb, returning `None` will exclude the patient from the analysis.
"""
@abc.abstractmethod
@@ -154,6 +154,12 @@ def get_categories(self) -> typing.Iterator[PatientCategory]:
"""
return (c.category for c in self.get_categorizations())
+ def get_category_names(self) -> typing.Iterator[str]:
+ """
+ Get an iterator with names of the :class:`PatientCategory` items that the predicate can produce.
+ """
+ return (cat.name for cat in self.get_categories())
+
def get_category(
self,
cat_id: int,
@@ -182,12 +188,21 @@ def get_category_name(
return self.get_category(cat_id).name
@abc.abstractmethod
- def get_question(self) -> str:
+ def get_question_base(self) -> str:
"""
Prepare a `str` with the question the predicate can answer.
"""
pass
+ def display_question(self) -> str:
+ """
+ Prepare the question which the predicate can answer.
+
+ The question includes the question base and the category names
+ """
+ cat_names = ', '.join(self.get_category_names())
+ return f'{self.get_question_base()}: {cat_names}'
+
@abc.abstractmethod
def test(self, patient: Patient) -> typing.Optional[C]:
"""
@@ -210,3 +225,28 @@ def _check_patient(patient: Patient):
"""
if not isinstance(patient, Patient):
raise ValueError(f"patient must be type Patient but was type {type(patient)}")
+
+ @staticmethod
+ def _check_categorizations(
+ categorizations: typing.Sequence[Categorization],
+ ) -> typing.Sequence[str]:
+ """
+ Check that the categorizations meet the requirements.
+
+ The requirements include:
+
+ * the `cat_id` must be unique within the predicate
+ """
+ issues = []
+ # There must be at least one category
+
+ # `cat_id` must be unique for a predicate!
+ cat_id_counts = Counter()
+ for c in categorizations:
+ cat_id_counts[c.category.cat_id] += 1
+
+ for cat_id, count in cat_id_counts.items():
+ if count > 1:
+ issues.append(f'`cat_id` {cat_id} is present {count}>1 times')
+
+ return issues
diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py
index a8a2ba919..86dd261a2 100644
--- a/src/gpsea/analysis/predicate/genotype/__init__.py
+++ b/src/gpsea/analysis/predicate/genotype/__init__.py
@@ -1,12 +1,14 @@
from ._api import GenotypePolyPredicate
from ._api import VariantPredicate
from ._counter import AlleleCounter
-from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate
+from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, recessive_predicate
+from ._gt_predicates import ModeOfInheritancePredicate
from ._variant import VariantPredicates, ProteinPredicates
__all__ = [
'GenotypePolyPredicate',
- 'boolean_predicate', 'groups_predicate', 'recessive_predicate',
+ 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'recessive_predicate',
+ 'ModeOfInheritancePredicate',
'AlleleCounter', 'VariantPredicate',
'VariantPredicates', 'ProteinPredicates',
]
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
index 3f54a7e02..43a39594a 100644
--- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
+++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py
@@ -1,8 +1,12 @@
+import dataclasses
+import enum
import typing
-from gpsea.model import Patient
+from collections import defaultdict
-from .._api import Categorization, PatientCategories
+from gpsea.model import Patient, Sex
+
+from .._api import Categorization, PatientCategory, PatientCategories
from ._api import GenotypePolyPredicate, RecessiveGroupingPredicate
from ._api import VariantPredicate
from ._counter import AlleleCounter
@@ -30,9 +34,12 @@ def get_categorizations(self) -> typing.Sequence[Categorization]:
:attr:`AlleleCountingGenotypeBooleanPredicate.NO`
or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category.
"""
- return AlleleCountingGenotypeBooleanPredicate.YES, AlleleCountingGenotypeBooleanPredicate.NO
+ return (
+ AlleleCountingGenotypeBooleanPredicate.YES,
+ AlleleCountingGenotypeBooleanPredicate.NO,
+ )
- def get_question(self) -> str:
+ def get_question_base(self) -> str:
return self._allele_counter.get_question()
def test(self, patient: Patient) -> typing.Optional[Categorization]:
@@ -49,15 +56,17 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
)
def __eq__(self, value: object) -> bool:
- return isinstance(value, AlleleCountingGenotypeBooleanPredicate) \
+ return (
+ isinstance(value, AlleleCountingGenotypeBooleanPredicate)
and self._allele_counter == value._allele_counter
-
+ )
+
def __hash__(self) -> int:
return hash((self._allele_counter,))
-
+
def __str__(self) -> str:
- return f'AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})'
-
+ return f"AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})"
+
def __repr__(self) -> str:
return str(self)
@@ -83,13 +92,12 @@ def __init__(
):
self._counters = tuple(counters)
self._categorizations = tuple(categorizations)
- group_names = ', '.join(c.category.name for c in self._categorizations)
- self._question = f'Genotype group: {group_names}'
+ self._question = "Genotype group"
def get_categorizations(self) -> typing.Sequence[Categorization]:
return self._categorizations
- def get_question(self) -> str:
+ def get_question_base(self) -> str:
return self._question
def test(self, patient: Patient) -> typing.Optional[Categorization]:
@@ -108,20 +116,29 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
return None
def __eq__(self, value: object) -> bool:
- return isinstance(value, AlleleCountingGroupsPredicate) \
- and self._counters == value._counters \
+ return (
+ isinstance(value, AlleleCountingGroupsPredicate)
+ and self._counters == value._counters
and self._categorizations == value._categorizations
+ )
def __hash__(self) -> int:
- return hash((self._counters, self._categorizations,))
+ return hash(
+ (
+ self._counters,
+ self._categorizations,
+ )
+ )
def __str__(self) -> str:
- return self.get_question()
+ return self.get_question_base()
def __repr__(self) -> str:
- return 'AlleleCountingGroupsPredicate(' \
- + 'counters={self._counters}, ' \
- + 'categorizations={self._categorizations})'
+ return (
+ "AlleleCountingGroupsPredicate("
+ + "counters={self._counters}, "
+ + "categorizations={self._categorizations})"
+ )
def groups_predicate(
@@ -134,6 +151,8 @@ def groups_predicate(
The genotype groups *should* not overlap.
In case of an overlap, the patient will be assigned into no group (`None`).
+ See the :ref:`groups-predicate` section for an example.
+
:param predicates: an iterable with at least 2 variant predicates to determine a genotype group.
:param group_names: an iterable with group names. The number of group names must match the number of predicates.
"""
@@ -141,9 +160,10 @@ def groups_predicate(
predicates = tuple(predicates)
group_names = tuple(group_names)
- assert len(predicates) >= 2, f'We need at least 2 predicates: {len(predicates)}'
- assert len(predicates) == len(group_names), \
- f'The number of group names must match the number of predicates: {len(group_names)}!={len(predicates)}'
+ assert len(predicates) >= 2, f"We need at least 2 predicates: {len(predicates)}"
+ assert len(predicates) == len(
+ group_names
+ ), f"The number of group names must match the number of predicates: {len(group_names)}!={len(predicates)}"
# Then, prepare the counters and categorizations.
counters = [AlleleCounter(predicate=predicate) for predicate in predicates]
@@ -164,18 +184,111 @@ def groups_predicate(
)
+class FilteringGenotypePolyPredicate(GenotypePolyPredicate):
+ # NOT PART OF THE PUBLIC API
+
+ @staticmethod
+ def create(
+ predicate: "GenotypePolyPredicate",
+ targets: typing.Collection[Categorization],
+ ) -> "FilteringGenotypePolyPredicate":
+ # At least 2 target categorizations must be provided
+ if len(targets) <= 1:
+ raise ValueError(f'At least 2 target categorizations must be provided but got {len(targets)}')
+
+ good_boys = tuple(isinstance(cat, Categorization) for cat in targets)
+ if not all(good_boys):
+ offenders = ', '.join(
+ str(i)
+ for i, is_instance
+ in enumerate(good_boys) if not is_instance
+ )
+ raise ValueError(f'The targets at following indices are not categorizations: [{offenders}]')
+
+ # All `allowed` categorizations must in fact be present in the `base` predicate.
+ cats_are_in_fact_present = tuple(cat in predicate.get_categorizations() for cat in targets)
+ if not all(cats_are_in_fact_present):
+ missing = ', '.join(
+ c.category.name
+ for c, is_present
+ in zip(targets, cats_are_in_fact_present) if not is_present
+ )
+ raise ValueError(f'Some from the categories are not present: {missing}')
+
+ if len(targets) == predicate.n_categorizations():
+ raise ValueError(
+ f'It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations '
+ f'with the same number ({len(targets)}) of targets'
+ )
+
+ return FilteringGenotypePolyPredicate(
+ predicate=predicate,
+ allowed=targets,
+ )
+
+ def __init__(
+ self,
+ predicate: "GenotypePolyPredicate",
+ allowed: typing.Iterable[Categorization],
+ ):
+ self._predicate = predicate
+ self._allowed = tuple(allowed)
+
+ def get_categorizations(self) -> typing.Sequence[Categorization]:
+ return self._allowed
+
+ def get_question_base(self) -> str:
+ return self._predicate.get_question_base()
+
+ def test(self, patient: Patient) -> typing.Optional[Categorization]:
+ cat = self._predicate.test(patient)
+ if cat in self._allowed:
+ return cat
+ else:
+ return None
+
+ def __repr__(self):
+ return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})"
+
+
+def filtering_predicate(
+ predicate: GenotypePolyPredicate,
+ targets: typing.Collection[Categorization],
+) -> GenotypePolyPredicate:
+ """
+ Filtering predicate applies the base `predicate` but only returns the categorizations
+ from the provided `targets` collection.
+
+ This can be useful if only some of the categorizations are interesting.
+ For instance, if we only seek to compare the differences between heterozygous and hemizygous variants,
+ but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups.
+
+ See the :ref:`filtering-predicate` section for an example.
+
+ The `predicate` is checked for being able to produce the all items in `targets`
+ and the `targets` must include at least 2 categorizations.
+
+ :param predicate: the base predicate whose categorizations are subject to filteration.
+ :param targets: the categorizations to retain
+ """
+ return FilteringGenotypePolyPredicate.create(
+ predicate=predicate,
+ targets=targets,
+ )
+
+
class AlleleCountingRecessivePredicate(RecessiveGroupingPredicate):
# NOT PART OF THE PUBLIC API
# TODO: this predicate is a bit weird and I think it should eventually go away.
# Therefore, I do not write any tests at this point.
def __init__(
- self,
+ self,
allele_counter: AlleleCounter,
):
self._allele_counter = allele_counter
- def get_question(self) -> str:
+ def get_question_base(self) -> str:
return self._allele_counter.get_question()
def test(self, patient: Patient) -> typing.Optional[Categorization]:
@@ -190,16 +303,21 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]:
return RecessiveGroupingPredicate.BOTH
else:
return None
-
+
def __eq__(self, value: object) -> bool:
- return isinstance(value, AlleleCountingRecessivePredicate) and self._allele_counter == value._allele_counter
-
+ return (
+ isinstance(value, AlleleCountingRecessivePredicate)
+ and self._allele_counter == value._allele_counter
+ )
+
def __hash__(self) -> int:
return hash((self._allele_counter,))
-
+
def __str__(self) -> str:
- return f'AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})'
-
+ return (
+ f"AlleleCountingRecessivePredicate(allele_counter={self._allele_counter})"
+ )
+
def __repr__(self) -> str:
return str(self)
@@ -209,8 +327,8 @@ def recessive_predicate(
) -> GenotypePolyPredicate:
"""
Create a recessive grouping predicate from given `variant_predicate`
- to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`,
- :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`,
+ to bin the patient into :class:`RecessiveGroupingPredicate.NEITHER`,
+ :class:`RecessiveGroupingPredicate.ONE`, or :class:`RecessiveGroupingPredicate.BOTH`,
depending on the number of variant alleles matching the variant predicate.
The patient is assigned into a group in the following manner:
@@ -221,3 +339,385 @@ def recessive_predicate(
"""
allele_counter = AlleleCounter(predicate=variant_predicate)
return AlleleCountingRecessivePredicate(allele_counter=allele_counter)
+
+
+@dataclasses.dataclass(eq=True, frozen=True)
+class GenotypeGroup:
+ allele_count: int
+ sex: typing.Optional[Sex]
+ categorization: Categorization
+
+
+class MendelianInheritanceAspect(enum.Enum):
+ AUTOSOMAL = 0
+ """
+ Related to chromosomes that do *not* determine the sex of an individual.
+ """
+
+ GONOSOMAL = 1
+ """
+ Related to chromosomes that determine the sex of an individual.
+ """
+
+ MITOCHONDRIAL = 2
+ """
+ Related to mitochondrial DNA.
+ """
+
+
+class ModeOfInheritanceInfo:
+
+ # NOT PART OF THE PUBLIC API!!!
+
+ HOM_REF = Categorization(
+ PatientCategory(
+ cat_id=0, name="HOM_REF", description="Homozygous reference",
+ ),
+ )
+ HET = Categorization(
+ PatientCategory(
+ cat_id=1, name="HET", description="Heterozygous",
+ ),
+ )
+ BIALLELIC_ALT = Categorization(
+ PatientCategory(
+ cat_id=2, name="BIALLELIC_ALT",
+ description="Homozygous alternate or compound heterozygous",
+ ),
+ )
+ HEMI = Categorization(
+ PatientCategory(
+ cat_id=3, name="HEMI", description="Hemizygous",
+ ),
+ )
+
+ @staticmethod
+ def autosomal_dominant() -> "ModeOfInheritanceInfo":
+ groups = (
+ GenotypeGroup(
+ allele_count=0,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HOM_REF,
+ ),
+ GenotypeGroup(
+ allele_count=1,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HET,
+ ),
+ )
+ return ModeOfInheritanceInfo(
+ mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL,
+ groups=groups,
+ )
+
+ @staticmethod
+ def autosomal_recessive() -> "ModeOfInheritanceInfo":
+ groups = (
+ GenotypeGroup(
+ allele_count=0,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HOM_REF,
+ ),
+ GenotypeGroup(
+ allele_count=1,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HET,
+ ),
+ GenotypeGroup(
+ allele_count=2,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.BIALLELIC_ALT,
+ ),
+ )
+ return ModeOfInheritanceInfo(
+ mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL,
+ groups=groups,
+ )
+
+ @staticmethod
+ def x_dominant() -> "ModeOfInheritanceInfo":
+ groups = (
+ GenotypeGroup(
+ allele_count=0,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HOM_REF,
+ ),
+ GenotypeGroup(
+ allele_count=1,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HET,
+ ),
+ )
+ return ModeOfInheritanceInfo(
+ mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL,
+ groups=groups,
+ )
+
+ @staticmethod
+ def x_recessive() -> "ModeOfInheritanceInfo":
+ groups = (
+ GenotypeGroup(
+ allele_count=0,
+ sex=None,
+ categorization=ModeOfInheritanceInfo.HOM_REF,
+ ),
+ GenotypeGroup(
+ allele_count=1,
+ sex=Sex.FEMALE,
+ categorization=ModeOfInheritanceInfo.HET,
+ ),
+ GenotypeGroup(
+ allele_count=2,
+ sex=Sex.FEMALE,
+ categorization=ModeOfInheritanceInfo.BIALLELIC_ALT,
+ ),
+ GenotypeGroup(
+ allele_count=1,
+ sex=Sex.MALE,
+ categorization=ModeOfInheritanceInfo.HEMI,
+ ),
+ )
+
+ return ModeOfInheritanceInfo(
+ mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL,
+ groups=groups,
+ )
+
+ def __init__(
+ self,
+ mendelian_inheritance_aspect: MendelianInheritanceAspect,
+ groups: typing.Iterable[GenotypeGroup],
+ ):
+ # We want this to be hashable but also keep a non-hashable dict
+ # as a field. Therefore, we pre-compute the hash manually.
+ # The correctness depends on two default dicts with same keys and values
+ # comparing equal.
+ hash_value = 17
+ assert isinstance(mendelian_inheritance_aspect, MendelianInheritanceAspect)
+ self._aspect = mendelian_inheritance_aspect
+
+ hash_value += 31 * hash(self._aspect)
+
+ self._groups = defaultdict(list)
+ for group in groups:
+ assert isinstance(group, GenotypeGroup)
+ self._groups[group.allele_count].append(group)
+ hash_value += 13 * hash(group)
+
+ self._hash = hash_value
+
+ @property
+ def groups(self) -> typing.Iterator[GenotypeGroup]:
+ # Flatten `values()` which is an iterable of lists.
+ return (group for meta_group in self._groups.values() for group in meta_group)
+
+ @property
+ def mendelian_inheritance_aspect(self) -> MendelianInheritanceAspect:
+ return self._aspect
+
+ def get_groups_for_allele_count(
+ self,
+ allele_count: int,
+ ) -> typing.Sequence[GenotypeGroup]:
+ try:
+ return self._groups[allele_count]
+ except KeyError:
+ # No group for this allele count is OK
+ return ()
+
+ def is_autosomal(self) -> bool:
+ return self._aspect == MendelianInheritanceAspect.AUTOSOMAL
+
+ def is_gonosomal(self) -> bool:
+ return self._aspect == MendelianInheritanceAspect.GONOSOMAL
+
+ def is_mitochondrial(self) -> bool:
+ return self._aspect == MendelianInheritanceAspect.MITOCHONDRIAL
+
+ def __eq__(self, value: object) -> bool:
+ return (
+ isinstance(value, ModeOfInheritanceInfo)
+ and self._aspect == value._aspect
+ and self._groups == value._groups
+ )
+
+ def __hash__(self) -> int:
+ return self._hash
+
+ def __str__(self) -> str:
+ return f"ModeOfInheritanceInfo(aspect={self._aspect}, groups={self._groups})"
+
+ def __repr__(self) -> str:
+ return str(self)
+
+
+class ModeOfInheritancePredicate(GenotypePolyPredicate):
+ """
+ `ModeOfInheritancePredicate` assigns an individual into a group based on compatibility with
+ the selected mode of inheritance.
+ """
+
+ @staticmethod
+ def autosomal_dominant(
+ variant_predicate: VariantPredicate,
+ ) -> "ModeOfInheritancePredicate":
+ """
+ Create a predicate that assigns the patient either
+ into homozygous reference or heterozygous
+ group in line with the autosomal dominant mode of inheritance.
+
+ :param variant_predicate: a predicate for choosing the variants for testing.
+ """
+ return ModeOfInheritancePredicate.from_moi_info(
+ variant_predicate=variant_predicate,
+ mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(),
+ )
+
+ @staticmethod
+ def autosomal_recessive(
+ variant_predicate: VariantPredicate,
+ ) -> "ModeOfInheritancePredicate":
+ """
+ Create a predicate that assigns the patient either into
+ homozygous reference, heterozygous, or biallelic alternative allele
+ (homozygous alternative or compound heterozygous)
+ group in line with the autosomal recessive mode of inheritance.
+
+ :param variant_predicate: a predicate for choosing the variants for testing.
+ """
+ return ModeOfInheritancePredicate.from_moi_info(
+ variant_predicate=variant_predicate,
+ mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(),
+ )
+
+ @staticmethod
+ def x_dominant(
+ variant_predicate: VariantPredicate,
+ ) -> "ModeOfInheritancePredicate":
+ """
+ Create a predicate that assigns the patient either into
+ homozygous reference or heterozygous
+ group in line with the X-linked dominant mode of inheritance.
+
+ :param variant_predicate: a predicate for choosing the variants for testing.
+ """
+ return ModeOfInheritancePredicate.from_moi_info(
+ variant_predicate=variant_predicate,
+ mode_of_inheritance_data=ModeOfInheritanceInfo.x_dominant(),
+ )
+
+ @staticmethod
+ def x_recessive(
+ variant_predicate: VariantPredicate,
+ ) -> "ModeOfInheritancePredicate":
+ """
+ Create a predicate that assigns the patient either into
+ homozygous reference, heterozygous, biallelic alternative allele
+ (homozygous alternative or compound heterozygous), or hemizygous
+ group in line with the X-linked recessive mode of inheritance.
+
+ :param variant_predicate: a predicate for choosing the variants for testing.
+ """
+ return ModeOfInheritancePredicate.from_moi_info(
+ variant_predicate=variant_predicate,
+ mode_of_inheritance_data=ModeOfInheritanceInfo.x_recessive(),
+ )
+
+ @staticmethod
+ def from_moi_info(
+ variant_predicate: VariantPredicate,
+ mode_of_inheritance_data: ModeOfInheritanceInfo,
+ ) -> "ModeOfInheritancePredicate":
+ """
+ Create a predicate for specified mode of inheritance data.
+ """
+ allele_counter = AlleleCounter(predicate=variant_predicate)
+ return ModeOfInheritancePredicate(
+ allele_counter=allele_counter,
+ mode_of_inheritance_info=mode_of_inheritance_data,
+ )
+
+ def __init__(
+ self,
+ allele_counter: AlleleCounter,
+ mode_of_inheritance_info: ModeOfInheritanceInfo,
+ ):
+ assert isinstance(allele_counter, AlleleCounter)
+ self._allele_counter = allele_counter
+
+ assert isinstance(mode_of_inheritance_info, ModeOfInheritanceInfo)
+ self._moi_info = mode_of_inheritance_info
+
+ self._categorizations = tuple(group.categorization for group in mode_of_inheritance_info.groups)
+ issues = ModeOfInheritancePredicate._check_categorizations(self._categorizations)
+ if issues:
+ raise ValueError('Cannot create predicate: {}'.format(', '.join(issues)))
+ self._question = 'Which genotype group does the patient fit in'
+
+ def get_categorizations(self) -> typing.Sequence[Categorization]:
+ return self._categorizations
+
+ def get_question_base(self) -> str:
+ return self._question
+
+ def test(
+ self,
+ patient: Patient,
+ ) -> typing.Optional[Categorization]:
+ self._check_patient(patient)
+
+ if self._moi_info.is_autosomal():
+ allele_count = self._allele_counter.count(patient)
+ groups = self._moi_info.get_groups_for_allele_count(allele_count)
+ if len(groups) == 1:
+ return groups[0].categorization
+ else:
+ return None
+ elif self._moi_info.is_gonosomal():
+ if patient.sex.is_provided():
+ allele_count = self._allele_counter.count(patient)
+ groups = self._moi_info.get_groups_for_allele_count(allele_count)
+ if len(groups) == 0:
+ # Unable to assign the individual.
+ return None
+ elif len(groups) == 1:
+ # We can only assign into one category no matter what the individual's sex is.
+ return groups[0].categorization
+ else:
+ # We choose depending on the sex.
+ for group in groups:
+ if group.sex is not None and group.sex == patient.sex:
+ return group.categorization
+ return None
+ else:
+ # We must have patient's sex
+ # to do any meaningful analysis
+ # in the non-autosomal scenario.
+ return None
+
+ elif self._moi_info.is_mitochondrial():
+ # Cannot deal with mitochondrial inheritance right now.
+ return None
+ else:
+ # Bug, please report to the developers
+ raise ValueError("Unexpected mode of inheritance condition")
+
+ def __eq__(self, value: object) -> bool:
+ return (
+ isinstance(value, ModeOfInheritancePredicate)
+ and self._allele_counter == value._allele_counter
+ and self._moi_info == value._moi_info
+ )
+
+ def __hash__(self) -> int:
+ return hash((self._allele_counter, self._moi_info,))
+
+ def __str__(self) -> str:
+ return (
+ "ModeOfInheritancePredicate("
+ f"allele_counter={self._allele_counter}, "
+ f"moi_info={self._moi_info})"
+ )
+
+ def __repr__(self) -> str:
+ return str(self)
diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py
index aeaf6e1df..54d397352 100644
--- a/src/gpsea/analysis/predicate/genotype/_variant.py
+++ b/src/gpsea/analysis/predicate/genotype/_variant.py
@@ -313,7 +313,7 @@ def is_structural_deletion(
to determine if the length of the variant is above or below `threshold`.
**IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele
- is shorter than the reference allele. See the method's documentation for more info.
+ is shorter than the reference allele. See :ref:`change-length-of-an-allele` for more info.
**Example**
diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py
index c2646f727..3b2d9a7b4 100644
--- a/src/gpsea/analysis/predicate/phenotype/_pheno.py
+++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py
@@ -60,11 +60,12 @@ class PhenotypePolyPredicate(
Each phenotype category `P` can be a :class:`~hpotk.model.TermId` representing an HPO term or an OMIM/MONDO term.
- Most of the time, only one category is investigated, and :attr:`phenotype` will return a sequence with
- one element only (e.g. *Arachnodactyly* `HP:0001166`). However, multiple categories can be sought as well,
- such as when the predicate bins the patient into one of discrete diseases
- (e.g. Geleophysic dysplasia, Marfan syndrome, ...). Then the predicate should return a sequence of disease
- identifiers.
+ Only one category can be investigated, and :attr:`phenotype` returns the investigated phenotype
+ (e.g. *Arachnodactyly* `HP:0001166`).
+
+ As another hallmark of this predicate, one of the categorizations must correspond to the group of patients
+ who exibit the investigated phenotype. The categorization is provided
+ via :attr:`present_phenotype_categorization` property.
"""
@property
@@ -75,6 +76,21 @@ def phenotype(self) -> P:
"""
pass
+ @property
+ @abc.abstractmethod
+ def present_phenotype_categorization(self) -> PhenotypeCategorization[P]:
+ """
+ Get the categorization which represents the group of the patients who exibit the investigated phenotype.
+ """
+ pass
+
+ @property
+ def present_phenotype_category(self) -> PatientCategory:
+ """
+ Get the patient category that correspond to the group of the patients who exibit the investigated phenotype.
+ """
+ return self.present_phenotype_categorization.category
+
class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]):
"""
@@ -112,18 +128,24 @@ def __init__(
category=PatientCategories.NO,
phenotype=self._query,
)
+ # Some tests depend on the order of `self._categorizations`.
+ self._categorizations = (self._phenotype_observed, self._phenotype_excluded)
- def get_question(self) -> str:
- return f"Is {self._query_label} present in the patient?"
+ def get_question_base(self) -> str:
+ return f"Is {self._query_label} present in the patient"
@property
def phenotype(self) -> hpotk.TermId:
return self._query
+ @property
+ def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]:
+ return self._phenotype_observed
+
def get_categorizations(
self,
) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]:
- return self._phenotype_observed, self._phenotype_excluded
+ return self._categorizations
def test(
self, patient: Patient
@@ -200,13 +222,17 @@ def __init__(self, disease_id_query: hpotk.TermId):
phenotype=disease_id_query,
)
- def get_question(self) -> str:
+ def get_question_base(self) -> str:
return f"Was {self._query} diagnosed in the patient"
@property
def phenotype(self) -> hpotk.TermId:
return self._query
+ @property
+ def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]:
+ return self._diagnosis_present
+
def get_categorizations(
self,
) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]:
diff --git a/src/gpsea/data/_toy.py b/src/gpsea/data/_toy.py
index e9b80234e..24a3b9c4b 100644
--- a/src/gpsea/data/_toy.py
+++ b/src/gpsea/data/_toy.py
@@ -3,7 +3,7 @@
from gpsea.model import *
from gpsea.model.genome import Contig, GenomicRegion, Strand
-CONTIG = Contig('1', 'GB_ACC', 'REFSEQ_NAME', 'UCSC_NAME', 1_000)
+CONTIG = Contig("1", "GB_ACC", "REFSEQ_NAME", "UCSC_NAME", 1_000)
def make_region(start: int, end: int) -> GenomicRegion:
@@ -22,189 +22,293 @@ def get_toy_cohort() -> Cohort:
# - Spasticity HP:0001257
# - Chronic pancreatitis HP:0006280
- arachnodactyly_T = Phenotype(TermId.from_curie('HP:0001166'), True)
- focal_clonic_seizure_T = Phenotype(TermId.from_curie('HP:0002266'), True)
- seizure_T = Phenotype(TermId.from_curie('HP:0001250'), True)
- spasticity_T = Phenotype(TermId.from_curie('HP:0001257'), True)
- arachnodactyly_F = Phenotype(TermId.from_curie('HP:0001166'), False)
- focal_clonic_seizure_F = Phenotype(TermId.from_curie('HP:0002266'), False)
- seizure_F = Phenotype(TermId.from_curie('HP:0001250'), False)
- spasticity_F = Phenotype(TermId.from_curie('HP:0001257'), False)
-
- Disease_T = Disease(TermId.from_curie('OMIM:001234'), "Test present disease", True)
- Disease_F = Disease(TermId.from_curie('OMIM:001234'), "Test absent disease", False)
+ arachnodactyly_T = Phenotype(TermId.from_curie("HP:0001166"), True)
+ focal_clonic_seizure_T = Phenotype(TermId.from_curie("HP:0002266"), True)
+ seizure_T = Phenotype(TermId.from_curie("HP:0001250"), True)
+ spasticity_T = Phenotype(TermId.from_curie("HP:0001257"), True)
+ arachnodactyly_F = Phenotype(TermId.from_curie("HP:0001166"), False)
+ focal_clonic_seizure_F = Phenotype(TermId.from_curie("HP:0002266"), False)
+ seizure_F = Phenotype(TermId.from_curie("HP:0001250"), False)
+ spasticity_F = Phenotype(TermId.from_curie("HP:0001257"), False)
- snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene',
- 'NM_1234.5', 'NM_1234.5:c.180A>G', False, [VariantEffect.MISSENSE_VARIANT], [1],
- 'NP_09876.5', 'NP_09876.5:p.Unk200', 60, 60,
- Genotypes.from_mapping({
- SampleLabels('A'): Genotype.HETEROZYGOUS, SampleLabels('B'): Genotype.HETEROZYGOUS,
- SampleLabels('C'): Genotype.HOMOZYGOUS_ALTERNATE,
- SampleLabels('D'): Genotype.HETEROZYGOUS, SampleLabels('E'): Genotype.HETEROZYGOUS,
- SampleLabels('G'): Genotype.HETEROZYGOUS, SampleLabels('J'): Genotype.HETEROZYGOUS,
- SampleLabels('K'): Genotype.HETEROZYGOUS, SampleLabels('M'): Genotype.HETEROZYGOUS,
- SampleLabels('N'): Genotype.HOMOZYGOUS_ALTERNATE,
- SampleLabels('P'): Genotype.HETEROZYGOUS,
- SampleLabels('Q'): Genotype.HOMOZYGOUS_ALTERNATE,
- SampleLabels('R'): Genotype.HETEROZYGOUS, SampleLabels('T'): Genotype.HETEROZYGOUS,
- SampleLabels('V'): Genotype.HETEROZYGOUS, SampleLabels('Y'): Genotype.HETEROZYGOUS,
- })
- )
- deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2),
- 'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del',
- False, [VariantEffect.FRAMESHIFT_VARIANT],
- [2], 'NP_09876.5','NP_09876.5:p.Unk200', 86, 87,
- Genotypes.from_mapping({
- SampleLabels('D'): Genotype.HETEROZYGOUS, SampleLabels('F'): Genotype.HETEROZYGOUS,
- SampleLabels('G'): Genotype.HETEROZYGOUS, SampleLabels('H'): Genotype.HETEROZYGOUS,
- SampleLabels('I'): Genotype.HOMOZYGOUS_ALTERNATE,
- SampleLabels('L'): Genotype.HETEROZYGOUS, SampleLabels('O'): Genotype.HETEROZYGOUS,
- SampleLabels('R'): Genotype.HETEROZYGOUS,
- SampleLabels('S'): Genotype.HOMOZYGOUS_ALTERNATE,
- SampleLabels('U'): Genotype.HETEROZYGOUS, SampleLabels('W'): Genotype.HETEROZYGOUS,
- SampleLabels('X'): Genotype.HETEROZYGOUS, SampleLabels('Z'): Genotype.HETEROZYGOUS,
- })
- )
+ Disease_T = Disease(TermId.from_curie("OMIM:001234"), "Test present disease", True)
+ Disease_F = Disease(TermId.from_curie("OMIM:001234"), "Test absent disease", False)
+
+ snv = Variant.create_variant_from_scratch(
+ VariantCoordinates(make_region(280, 281), "A", "G", 0),
+ "FakeGene",
+ "NM_1234.5",
+ "NM_1234.5:c.180A>G",
+ False,
+ [VariantEffect.MISSENSE_VARIANT],
+ [1],
+ "NP_09876.5",
+ "NP_09876.5:p.Unk200",
+ 60,
+ 60,
+ Genotypes.from_mapping(
+ {
+ SampleLabels("A"): Genotype.HETEROZYGOUS,
+ SampleLabels("B"): Genotype.HETEROZYGOUS,
+ SampleLabels("C"): Genotype.HOMOZYGOUS_ALTERNATE,
+ SampleLabels("D"): Genotype.HETEROZYGOUS,
+ SampleLabels("E"): Genotype.HETEROZYGOUS,
+ SampleLabels("G"): Genotype.HETEROZYGOUS,
+ SampleLabels("J"): Genotype.HETEROZYGOUS,
+ SampleLabels("K"): Genotype.HETEROZYGOUS,
+ SampleLabels("M"): Genotype.HETEROZYGOUS,
+ SampleLabels("N"): Genotype.HOMOZYGOUS_ALTERNATE,
+ SampleLabels("P"): Genotype.HETEROZYGOUS,
+ SampleLabels("Q"): Genotype.HOMOZYGOUS_ALTERNATE,
+ SampleLabels("R"): Genotype.HETEROZYGOUS,
+ SampleLabels("T"): Genotype.HETEROZYGOUS,
+ SampleLabels("V"): Genotype.HETEROZYGOUS,
+ SampleLabels("Y"): Genotype.HETEROZYGOUS,
+ }
+ ),
+ )
+ deletion = Variant.create_variant_from_scratch(
+ VariantCoordinates(make_region(360, 363), "TTC", "T", -2),
+ "FakeGene",
+ "NM_1234.5",
+ "NM_1234.5:c.261_263del",
+ False,
+ [VariantEffect.FRAMESHIFT_VARIANT],
+ [2],
+ "NP_09876.5",
+ "NP_09876.5:p.Unk200",
+ 86,
+ 87,
+ Genotypes.from_mapping(
+ {
+ SampleLabels("D"): Genotype.HETEROZYGOUS,
+ SampleLabels("F"): Genotype.HETEROZYGOUS,
+ SampleLabels("G"): Genotype.HETEROZYGOUS,
+ SampleLabels("H"): Genotype.HETEROZYGOUS,
+ SampleLabels("I"): Genotype.HOMOZYGOUS_ALTERNATE,
+ SampleLabels("L"): Genotype.HETEROZYGOUS,
+ SampleLabels("O"): Genotype.HETEROZYGOUS,
+ SampleLabels("R"): Genotype.HETEROZYGOUS,
+ SampleLabels("S"): Genotype.HOMOZYGOUS_ALTERNATE,
+ SampleLabels("U"): Genotype.HETEROZYGOUS,
+ SampleLabels("W"): Genotype.HETEROZYGOUS,
+ SampleLabels("X"): Genotype.HETEROZYGOUS,
+ SampleLabels("Z"): Genotype.HETEROZYGOUS,
+ }
+ ),
+ )
het_dup = Variant.create_variant_from_scratch(
- VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5',
- 'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], 'NP_09876.5', 'NP_09876.5:p.Unk200', 25, 25,
- Genotypes.empty()) # Not used in the patients below, hence `empty()`.
+ VariantCoordinates(make_region(175, 176), "T", "TG", 1),
+ "FakeGene",
+ "NM_1234.5",
+ "NM_1234.5:c.75A>G",
+ False,
+ [VariantEffect.FRAMESHIFT_VARIANT],
+ [1],
+ "NP_09876.5",
+ "NP_09876.5:p.Unk200",
+ 25,
+ 25,
+ Genotypes.empty(),
+ ) # Not used in the patients below, hence `empty()`.
hom_dup = Variant.create_variant_from_scratch(
- VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5',
- 'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], 'NP_09876.5', 'NP_09876.5:p.Unk200', 25, 25,
- Genotypes.empty()) # Not used in the patients below, hence `empty()`.
+ VariantCoordinates(make_region(175, 176), "T", "TG", 1),
+ "FakeGene",
+ "NM_1234.5",
+ "NM_1234.5:c.75A>G",
+ False,
+ [VariantEffect.FRAMESHIFT_VARIANT],
+ [1],
+ "NP_09876.5",
+ "NP_09876.5:p.Unk200",
+ 25,
+ 25,
+ Genotypes.empty(),
+ ) # Not used in the patients below, hence `empty()`.
patients = (
- Patient(SampleLabels('A'),
- phenotypes=(arachnodactyly_T, spasticity_F, seizure_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('B'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('C'),
- phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('D'),
- phenotypes=(arachnodactyly_T, spasticity_T, seizure_T),
- variants=[snv, deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('E'),
- phenotypes=(arachnodactyly_T, spasticity_T, seizure_F),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('F'),
- phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('G'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
- variants=[snv, deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('H'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('I'),
- phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('J'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('K'),
- phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('L'),
- phenotypes=(arachnodactyly_F, seizure_F, spasticity_F),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('M'),
- phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('N'),
- phenotypes=(arachnodactyly_F, seizure_T, spasticity_F),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('O'),
- phenotypes=(arachnodactyly_F, seizure_F, spasticity_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('P'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('Q'),
- phenotypes=(arachnodactyly_T, seizure_F, spasticity_F),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('R'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
- variants=[snv, deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('S'),
- phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('T'),
- phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('U'),
- phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('V'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('W'),
- phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('X'),
- phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('Y'),
- phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
- variants=[snv],
- diseases=[Disease_T]
- ),
- Patient(SampleLabels('Z'),
- phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
- variants=[deletion],
- diseases=[Disease_T]
- ),
+ Patient.from_raw_parts(
+ SampleLabels("A"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, spasticity_F, seizure_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("B"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("C"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("D"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, spasticity_T, seizure_T),
+ variants=[snv, deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("E"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, spasticity_T, seizure_F),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("F"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("G"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+ variants=[snv, deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("H"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("I"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, spasticity_F, seizure_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("J"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("K"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, spasticity_T, seizure_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("L"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_F, spasticity_F),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("M"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("N"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_T, spasticity_F),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("O"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_F, spasticity_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("P"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("Q"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_F, spasticity_F),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("R"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_F),
+ variants=[snv, deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("S"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("T"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_F, spasticity_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("U"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("V"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("W"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("X"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("Y"),
+ sex=None,
+ phenotypes=(arachnodactyly_T, seizure_T, spasticity_T),
+ variants=[snv],
+ diseases=[Disease_T],
+ ),
+ Patient.from_raw_parts(
+ SampleLabels("Z"),
+ sex=None,
+ phenotypes=(arachnodactyly_F, seizure_T, spasticity_T),
+ variants=[deletion],
+ diseases=[Disease_T],
+ ),
)
return Cohort.from_patients(patients)
diff --git a/src/gpsea/io.py b/src/gpsea/io.py
index a9b300784..c5b3aea18 100644
--- a/src/gpsea/io.py
+++ b/src/gpsea/io.py
@@ -23,7 +23,7 @@ def default(self, o):
'tx_annotations': o.tx_annotations,
'genotypes': o.genotypes,
}
- elif isinstance(o, VariantInfo):
+ elif isinstance(o, VariantInfo):
return {
'variant_coordinates': o.variant_coordinates,
'sv_info': o.sv_info,
@@ -87,7 +87,7 @@ def default(self, o):
'label': o.label,
'meta_label': o.meta_label,
}
- elif isinstance(o, (Genotype, VariantEffect, Strand, VariantClass)):
+ elif isinstance(o, (Sex, Genotype, VariantEffect, Strand, VariantClass)):
# enums
return o.name
elif isinstance(o, Phenotype):
@@ -104,6 +104,7 @@ def default(self, o):
elif isinstance(o, Patient):
return {
'labels': o.labels,
+ 'sex': o.sex,
'phenotypes': o.phenotypes,
'diseases': o.diseases,
'variants': o.variants,
@@ -161,7 +162,7 @@ def default(self, o):
_FEATURE_INFO = ('name', 'region')
_PHENOTYPE_FIELDS = ('term_id', 'is_present')
_DISEASE_FIELDS = ('term_id', 'name', 'is_observed')
-_PATIENT_FIELDS = ('labels', 'phenotypes', 'diseases', 'variants')
+_PATIENT_FIELDS = ('labels', 'sex', 'phenotypes', 'diseases', 'variants')
_COHORT_FIELDS = ('members', 'excluded_patient_count')
@@ -281,6 +282,7 @@ def object_hook(obj: typing.Dict[typing.Any, typing.Any]) -> typing.Any:
elif GpseaJSONDecoder._has_all_fields(obj, _PATIENT_FIELDS):
return Patient(
labels=obj['labels'],
+ sex=Sex[obj['sex']],
phenotypes=obj['phenotypes'],
diseases=obj['diseases'],
variants=obj['variants'],
diff --git a/src/gpsea/model/__init__.py b/src/gpsea/model/__init__.py
index 2530b4987..1d17138c5 100644
--- a/src/gpsea/model/__init__.py
+++ b/src/gpsea/model/__init__.py
@@ -5,19 +5,21 @@
and we follow with data classes for phenotype, genotype, transcript, and protein info.
"""
-from ._base import SampleLabels
+from ._base import SampleLabels, Sex
from ._cohort import Cohort, Patient
from ._gt import Genotype, Genotypes, Genotyped
from ._phenotype import Phenotype, Disease
from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
from ._tx import TranscriptCoordinates
-from ._variant import VariantCoordinates, ImpreciseSvInfo, VariantInfo, VariantInfoAware, TranscriptAnnotation, TranscriptInfoAware, VariantClass, Variant
+from ._variant import VariantCoordinates, ImpreciseSvInfo, VariantInfo, VariantInfoAware, VariantClass, Variant
+from ._variant import TranscriptAnnotation, TranscriptInfoAware
from ._variant_effects import VariantEffect
__all__ = [
- 'Cohort', 'Patient', 'SampleLabels',
+ 'Cohort', 'Patient', 'SampleLabels', 'Sex',
'Phenotype', 'Disease',
- 'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware', 'Genotype', 'Genotypes', 'Genotyped',
+ 'Variant', 'VariantClass', 'VariantCoordinates', 'ImpreciseSvInfo', 'VariantInfo', 'VariantInfoAware',
+ 'Genotype', 'Genotypes', 'Genotyped',
'TranscriptAnnotation', 'VariantEffect', 'TranscriptInfoAware', 'TranscriptCoordinates',
'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
]
diff --git a/src/gpsea/model/_base.py b/src/gpsea/model/_base.py
index 36716831e..09a1c913d 100644
--- a/src/gpsea/model/_base.py
+++ b/src/gpsea/model/_base.py
@@ -1,8 +1,56 @@
+import enum
import typing
import hpotk
+class Sex(enum.Enum):
+ """
+ `Sex` represents typical “phenotypic sex”, as would be determined by a midwife or physician at birth.
+
+ The definition is aligned with `Phenopacket Schema `_
+ """
+
+ UNKNOWN_SEX = 0
+ """
+ Not assessed or not available. Maps to ``NCIT:C17998``.
+ """
+
+ FEMALE = 1
+ """
+ Female sex. Maps to ``NCIT:C46113``.
+ """
+
+ MALE = 2
+ """
+ Male sex. Maps to ``NCIT:C46112``.
+ """
+
+ def is_provided(self) -> bool:
+ """
+ Return `True` if the sex is a known value, such as `FEMALE` or `MALE`.
+ """
+ return self != Sex.UNKNOWN_SEX
+
+ def is_unknown(self) -> bool:
+ """
+ Return `True` if this is an `UNKNOWN_SEX`.
+ """
+ return self == Sex.UNKNOWN_SEX
+
+ def is_female(self) -> bool:
+ """
+ Return `True` if the sex represents a `FEMALE`.
+ """
+ return self == Sex.MALE
+
+ def is_male(self) -> bool:
+ """
+ Return `True` if the sex represents a `MALE`.
+ """
+ return self == Sex.MALE
+
+
class SampleLabels:
"""
A data model for subject identifiers.
diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py
index 3209748e8..03fcf08e4 100644
--- a/src/gpsea/model/_cohort.py
+++ b/src/gpsea/model/_cohort.py
@@ -5,7 +5,7 @@
import hpotk
-from ._base import SampleLabels
+from ._base import SampleLabels, Sex
from ._phenotype import Phenotype, Disease
from ._variant import Variant
@@ -13,16 +13,49 @@
class Patient:
"""
`Patient` represents a single investigated individual.
+
+ .. note::
+
+ We strongly recommend using the :func:`from_raw_parts` static constructor
+ instead of `__init__`.
"""
+ @staticmethod
+ def from_raw_parts(
+ labels: SampleLabels,
+ sex: typing.Optional[Sex],
+ phenotypes: typing.Iterable[Phenotype],
+ diseases: typing.Iterable[Disease],
+ variants: typing.Iterable[Variant]
+ ) -> "Patient":
+ """
+ Create `Patient` from the primary data.
+ """
+ if sex is None:
+ sex = Sex.UNKNOWN_SEX
+
+ return Patient(
+ labels=labels,
+ sex=sex,
+ phenotypes=phenotypes,
+ diseases=diseases,
+ variants=variants,
+ )
+
def __init__(
self,
labels: SampleLabels,
+ sex: Sex,
phenotypes: typing.Iterable[Phenotype],
diseases: typing.Iterable[Disease],
variants: typing.Iterable[Variant]
):
+ assert isinstance(labels, SampleLabels)
self._labels = labels
+
+ assert isinstance(sex, Sex)
+ self._sex = sex
+
self._phenotypes = tuple(phenotypes)
self._diseases = tuple(diseases)
self._variants = tuple(variants)
@@ -41,6 +74,13 @@ def labels(self) -> SampleLabels:
"""
return self._labels
+ @property
+ def sex(self) -> Sex:
+ """
+ Get the "phenotype sex" of the sample.
+ """
+ return self._sex
+
@property
def phenotypes(self) -> typing.Sequence[Phenotype]:
"""
@@ -89,6 +129,7 @@ def excluded_diseases(self) -> typing.Iterator[Disease]:
def __str__(self) -> str:
return (f"Patient("
f"labels:{self._labels}, "
+ f"sex:{self._sex}, "
f"variants:{self.variants}, "
f"phenotypes:{[pheno.identifier for pheno in self.phenotypes]}, "
f"diseases:{[dis.identifier for dis in self.diseases]}")
@@ -99,12 +140,13 @@ def __repr__(self) -> str:
def __eq__(self, other) -> bool:
return (isinstance(other, Patient)
and self._labels == other._labels
+ and self._sex == other._sex
and self._variants == other._variants
and self._phenotypes == other._phenotypes
and self._diseases == other._diseases)
def __hash__(self) -> int:
- return hash((self._labels, self._variants, self._phenotypes, self._diseases))
+ return hash((self._labels, self._sex, self._variants, self._phenotypes, self._diseases))
class Cohort(typing.Sized, typing.Iterable[Patient]):
@@ -201,7 +243,7 @@ def list_present_phenotypes(
Otherwise, lists only the `top` highest counts
Returns:
- typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE,
+ typing.Sequence[typing.Tuple[str, int]]: A sequence of tuples, formatted (phenotype CURIE,
number of patients with that phenotype)
"""
counter = Counter()
diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py
index b4367167b..1551b4d63 100644
--- a/src/gpsea/preprocessing/__init__.py
+++ b/src/gpsea/preprocessing/__init__.py
@@ -4,7 +4,7 @@
from ._config import configure_caching_patient_creator, configure_patient_creator
from ._config import load_phenopacket_folder, load_phenopackets
from ._config import configure_caching_cohort_creator, configure_cohort_creator
-from ._config import configure_protein_metadata_service
+from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service
from ._generic import DefaultImpreciseSvFunctionalAnnotator
from ._patient import PatientCreator, CohortCreator
from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator
@@ -18,7 +18,7 @@
__all__ = [
'configure_caching_patient_creator', 'configure_patient_creator',
'configure_caching_cohort_creator', 'configure_cohort_creator',
- 'configure_protein_metadata_service',
+ 'configure_default_protein_metadata_service', 'configure_protein_metadata_service',
'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService',
'PatientCreator', 'CohortCreator',
'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopackets',
diff --git a/src/gpsea/preprocessing/_audit.py b/src/gpsea/preprocessing/_audit.py
index f4ab6597d..3bdafe97e 100644
--- a/src/gpsea/preprocessing/_audit.py
+++ b/src/gpsea/preprocessing/_audit.py
@@ -165,7 +165,7 @@ def __init__(self, label: str):
@abc.abstractmethod
- def add_subsection(self, label: str):
+ def add_subsection(self, label: str) -> "Notepad":
"""
Add a labeled subsection.
diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py
index 67c14eab3..348bd5b1c 100644
--- a/src/gpsea/preprocessing/_config.py
+++ b/src/gpsea/preprocessing/_config.py
@@ -225,14 +225,65 @@ def configure_protein_metadata_service(
In any case, the directory will be created if it does not exist (including any non-existing parents).
:param timeout: timeout in seconds for the REST APIs.
"""
+ warnings.warn('Use `configure_default_protein_metadata_service` instead', DeprecationWarning, stacklevel=2)
+ return configure_default_protein_metadata_service(
+ cache_dir=cache_dir,
+ timeout=timeout,
+ )
+
+
+def configure_default_protein_metadata_service(
+ protein_source: typing.Literal['UNIPROT'] = 'UNIPROT',
+ cache_dir: typing.Optional[str] = None,
+ timeout: float = 30.,
+) -> ProteinMetadataService:
+ """
+ Create default protein metadata service that will cache the protein metadata
+ in current working directory under `.gpsea_cache/protein_cache`
+ and reach out to UNIPROT REST API if a cache entry is missing.
+
+ :param protein_source: a `str` with the code of the protein data sources (currently accepting just `UNIPROT`).
+ :param cache_dir: path to the folder where we will cache the results fetched from the remote APIs or `None`
+ if the data should be cached as described by :func:`~gpsea.config.get_cache_dir_path` function.
+ In any case, the directory will be created if it does not exist (including any non-existing parents).
+ :param timeout: timeout in seconds for the REST APIs.
+ """
cache_dir = _configure_cache_dir(cache_dir)
+ return _configure_protein_service(
+ protein_fallback=protein_source,
+ cache_dir=cache_dir,
+ timeout=timeout,
+ )
- protein_cache_dir = os.path.join(cache_dir, "protein_cache")
- os.makedirs(protein_cache_dir, exist_ok=True)
- cache = ProteinAnnotationCache(protein_cache_dir)
- fallback = UniprotProteinMetadataService(timeout=timeout)
- return ProtCachingMetadataService(cache=cache, fallback=fallback)
+def _configure_protein_service(
+ protein_fallback: str,
+ cache_dir: str,
+ timeout: float,
+) -> ProteinMetadataService:
+ # (1) ProteinMetadataService
+ # Setup fallback
+ protein_fallback = _configure_fallback_protein_service(
+ protein_fallback,
+ timeout,
+ )
+ # Setup protein metadata cache
+ prot_cache_dir = os.path.join(cache_dir, 'protein_cache')
+ os.makedirs(prot_cache_dir, exist_ok=True)
+ prot_cache = ProteinAnnotationCache(prot_cache_dir)
+ # Assemble the final protein metadata service
+ protein_metadata_service = ProtCachingMetadataService(prot_cache, protein_fallback)
+ return protein_metadata_service
+
+
+def _configure_fallback_protein_service(
+ protein_fallback: str,
+ timeout: float,
+) -> ProteinMetadataService:
+ if protein_fallback == 'UNIPROT':
+ return UniprotProteinMetadataService(timeout)
+ else:
+ raise ValueError(f'Unknown protein fallback annotator type {protein_fallback}')
def _configure_cache_dir(
diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py
index abe83fbcc..136825bd7 100644
--- a/src/gpsea/preprocessing/_phenopacket.py
+++ b/src/gpsea/preprocessing/_phenopacket.py
@@ -4,15 +4,28 @@
import hpotk
from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket
+import phenopackets.schema.v2.core.individual_pb2 as ppi
from phenopackets.schema.v2.core.disease_pb2 import Disease as PPDisease
from phenopackets.schema.v2.core.interpretation_pb2 import GenomicInterpretation
from phenopackets.vrsatile.v1.vrsatile_pb2 import VcfRecord, VariationDescriptor
from phenopackets.vrs.v1.vrs_pb2 import Variation
-from gpsea.model import Patient, SampleLabels, Disease
-from gpsea.model import VariantClass, VariantCoordinates, ImpreciseSvInfo, VariantInfo, Variant, Genotype, Genotypes
+from gpsea.model import SampleLabels, Patient, Sex, Disease
+from gpsea.model import (
+ VariantClass,
+ VariantCoordinates,
+ ImpreciseSvInfo,
+ VariantInfo,
+ Variant,
+ Genotype,
+ Genotypes,
+)
from gpsea.model.genome import GenomeBuild, GenomicRegion, Strand
-from ._api import VariantCoordinateFinder, FunctionalAnnotator, ImpreciseSvFunctionalAnnotator
+from ._api import (
+ VariantCoordinateFinder,
+ FunctionalAnnotator,
+ ImpreciseSvFunctionalAnnotator,
+)
from ._audit import Notepad
from ._patient import PatientCreator
from ._phenotype import PhenotypeCreator
@@ -27,14 +40,14 @@ def find_genotype(
self,
item: GenomicInterpretation,
) -> typing.Optional[Genotype]:
- if item.HasField('variant_interpretation'):
+ if item.HasField("variant_interpretation"):
variant_interpretation = item.variant_interpretation
- if variant_interpretation.HasField('variation_descriptor'):
+ if variant_interpretation.HasField("variation_descriptor"):
variation_descriptor = variant_interpretation.variation_descriptor
- if variation_descriptor.HasField('allelic_state'):
+ if variation_descriptor.HasField("allelic_state"):
genotype = variation_descriptor.allelic_state.label
return self._map_geno_genotype_label(genotype)
-
+
return None
@staticmethod
@@ -85,7 +98,7 @@ def find_coordinates(
Args:
item (GenomicInterpretation): a genomic interpretation element from Phenopacket Schema
-
+
Returns:
typing.Optional[VariantCoordinates]: variant coordinates
"""
@@ -102,8 +115,9 @@ def find_coordinates(
variation_descriptor.vcf_record.genome_assembly
):
raise ValueError(
- f"Variant id {variation_descriptor.id} for patient {item.subject_or_biosample_id} has a different Genome Assembly than what was given. "
- + f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}."
+ f"Variant id {variation_descriptor.id} for patient {item.subject_or_biosample_id} "
+ "has a different Genome Assembly than what was given. "
+ f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}."
)
contig = self._build.contig_by_name(variation_descriptor.vcf_record.chrom)
start = int(variation_descriptor.vcf_record.pos) - 1
@@ -134,7 +148,9 @@ def find_coordinates(
alt = ""
change_length = end - start
else:
- raise ValueError(f"The copy number of {number} is not supported. Supported values: {{1, 3}}")
+ raise ValueError(
+ f"The copy number of {number} is not supported. Supported values: {{1, 3}}"
+ )
region = GenomicRegion(contig, start, end, Strand.POSITIVE)
return VariantCoordinates(region, ref, alt, change_length)
@@ -197,7 +213,7 @@ def _looks_like_large_sv(
else None
)
- # If we have these fields, we seem to have all information
+ # If we have these fields, we seem to have all information
# to parse the variation descriptor elsewhere.
return structural_type is not None and gene_context is not None
@@ -214,7 +230,19 @@ def __init__(
functional_annotator: FunctionalAnnotator,
imprecise_sv_functional_annotator: ImpreciseSvFunctionalAnnotator,
hgvs_coordinate_finder: VariantCoordinateFinder[str],
+ assume_karyotypic_sex: bool = True,
):
+ """
+ Create an instance.
+
+ :param build: the genome build to use for variants.
+ :param phenotype_creator: a phenotype creator for creating phenotypes.
+ :param functional_annotator: for computing functional annotations.
+ :param imprecise_sv_functional_annotator: for getting info about imprecise variants.
+ :param hgvs_coordinate_finder: for finding chromosomal coordinates for HGVS variant descriptions.
+ :param assume_karyotypic_sex: `True` if it is OK to assume that `FEMALE` has the `XX` karyotype
+ and `MALE` has `XY`.
+ """
self._logger = logging.getLogger(__name__)
# Violates DI, but it is specific to this class, so I'll leave it "as is".
self._coord_finder = PhenopacketVariantCoordinateFinder(
@@ -228,49 +256,64 @@ def __init__(
functional_annotator, FunctionalAnnotator, "functional_annotator"
)
self._imprecise_sv_functional_annotator = hpotk.util.validate_instance(
- imprecise_sv_functional_annotator, ImpreciseSvFunctionalAnnotator, "imprecise_sv_functional_annotator"
+ imprecise_sv_functional_annotator,
+ ImpreciseSvFunctionalAnnotator,
+ "imprecise_sv_functional_annotator",
)
-
+
# Set of sequence ontology IDs that we will treat as a deletion (`DEL`)
# for the purpose of assigning imprecise SV info with a variant class.
self._so_deletions = {
- '1000029', # chromosomal deletion: An incomplete chromosome.
- '0001893', # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature.
- '0001879', # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence, where the deletion, is greater than the extent of the underlying genomic features.
+ "1000029", # chromosomal deletion: An incomplete chromosome.
+ # transcript ablation: A feature ablation whereby the deleted region includes a transcript feature.
+ "0001893",
+ # feature_ablation: A sequence variant, caused by an alteration of the genomic sequence,
+ # where the deletion, is greater than the extent of the underlying genomic features.
+ "0001879",
}
self._so_duplications = {
- '1000037', # chromosomal_duplication
+ "1000037", # chromosomal_duplication
}
+ self._assume_karyotypic_sex = assume_karyotypic_sex
- def process(self, inputs: Phenopacket, notepad: Notepad) -> Patient:
+ def process(self, pp: Phenopacket, notepad: Notepad) -> Patient:
"""Creates a Patient from the data in a given Phenopacket
Args:
- inputs (Phenopacket): A Phenopacket object
+ pp (Phenopacket): A Phenopacket object
notepad (Notepad): notepad to write down the issues
Returns:
Patient: A Patient object
"""
sample_id = SampleLabels(
- label=inputs.subject.id,
- meta_label=inputs.id if len(inputs.id) > 0 else None,
+ label=pp.subject.id,
+ meta_label=pp.id if len(pp.id) > 0 else None,
)
+ # Extract karyotypic sex
+ indi = notepad.add_subsection("individual")
+ sex = self._extract_sex(pp.subject, indi)
+
# Check phenotypes
pfs = notepad.add_subsection("phenotype-features")
phenotypes = self._phenotype_creator.process(
- ((pf.type.id, not pf.excluded) for pf in inputs.phenotypic_features), pfs
+ ((pf.type.id, not pf.excluded) for pf in pp.phenotypic_features), pfs
)
# Check diseases
- diseases = self._add_diseases([dis for dis in inputs.diseases], pfs)
+ dip = notepad.add_subsection("diseases")
+ diseases = self._add_diseases(pp.diseases, dip)
# Check variants
vs = notepad.add_subsection("variants")
- variants = self._add_variants(sample_id, inputs, vs)
-
- return Patient(
- sample_id, phenotypes=phenotypes, variants=variants, diseases=diseases
+ variants = self._add_variants(sample_id, pp, vs)
+
+ return Patient.from_raw_parts(
+ sample_id,
+ sex=sex,
+ phenotypes=phenotypes,
+ variants=variants,
+ diseases=diseases,
)
def _add_diseases(
@@ -285,21 +328,43 @@ def _add_diseases(
Sequence[Dis]: A list of Disease objects
"""
if len(diseases) == 0:
- notepad.add_warning(f"No diseases found.")
- return []
+ notepad.add_warning("No diseases found")
+ return ()
+
final_diseases = []
for i, dis in enumerate(diseases):
if not dis.HasField("term"):
- raise ValueError("Could not find term in Disease.")
- term_id = hpotk.TermId.from_curie(dis.term.id)
+ notepad.add_error(f"#{i} has no `term`")
+ continue
+ else:
+ term_id = hpotk.TermId.from_curie(dis.term.id)
+
# Do not include excluded diseases if we decide to assume excluded if not included
final_diseases.append(Disease(term_id, dis.term.label, not dis.excluded))
+
return final_diseases
+ def _extract_sex(
+ self,
+ individual: ppi.Individual,
+ notepad: Notepad,
+ ) -> typing.Optional[Sex]:
+ # Let's use the phenotypic sex as fallback
+ sex = individual.sex
+ if sex == ppi.FEMALE:
+ return Sex.FEMALE
+ elif sex == ppi.MALE:
+ return Sex.MALE
+ elif sex == ppi.OTHER_SEX or sex == ppi.UNKNOWN_SEX:
+ return Sex.UNKNOWN_SEX
+ else:
+ notepad.add_warning(f'Unknown sex type: {sex}')
+ return Sex.UNKNOWN_SEX
+
def _add_variants(
- self,
- sample_id: SampleLabels,
- pp: Phenopacket,
+ self,
+ sample_id: SampleLabels,
+ pp: Phenopacket,
notepad: Notepad,
) -> typing.Sequence[Variant]:
"""Creates a list of Variant objects from the data in a given Phenopacket
@@ -311,24 +376,28 @@ def _add_variants(
Sequence[Variant]: A list of Variant objects
"""
variants = []
-
+
for i, interpretation in enumerate(pp.interpretations):
- sub_note = notepad.add_subsection(f'#{i}')
- if interpretation.HasField('diagnosis'):
- for genomic_interpretation in interpretation.diagnosis.genomic_interpretations:
+ sub_note = notepad.add_subsection(f"#{i}")
+ if interpretation.HasField("diagnosis"):
+ for (
+ genomic_interpretation
+ ) in interpretation.diagnosis.genomic_interpretations:
gt = self._gt_parser.find_genotype(genomic_interpretation)
if gt is None:
sub_note.add_warning(
- f"Could not extract genotype from genomic interpretation",
+ "Could not extract genotype from genomic interpretation",
"Remove variant from testing",
)
continue
-
- variant_info = self._extract_variant_info(sample_id, genomic_interpretation, sub_note)
+
+ variant_info = self._extract_variant_info(
+ sample_id, genomic_interpretation, sub_note
+ )
if variant_info is None:
# We already complained in the extract function
continue
-
+
if variant_info.has_variant_coordinates():
try:
tx_annotations = self._functional_annotator.annotate(
@@ -342,8 +411,10 @@ def _add_variants(
continue
elif variant_info.has_sv_info():
try:
- tx_annotations = self._imprecise_sv_functional_annotator.annotate(
- item=variant_info.sv_info,
+ tx_annotations = (
+ self._imprecise_sv_functional_annotator.annotate(
+ item=variant_info.sv_info,
+ )
)
except ValueError as error:
sub_note.add_warning(
@@ -352,23 +423,25 @@ def _add_variants(
)
continue
else:
- raise ValueError('VariantInfo should have either the coordinates or the SV info, but had neither!')
+ raise ValueError(
+ "VariantInfo should have either the coordinates or the SV info, but had neither!"
+ )
if len(tx_annotations) == 0:
sub_note.add_warning(
f"Patient {pp.id} has an error with variant {variant_info.variant_key}",
- f"Remove variant from testing... tx_anno == 0",
+ "Remove variant from testing... tx_anno == 0",
)
continue
-
genotype = Genotypes.single(sample_id, gt)
variants.append(
Variant(
- variant_info=variant_info,
- tx_annotations=tx_annotations,
+ variant_info=variant_info,
+ tx_annotations=tx_annotations,
genotypes=genotype,
- ))
+ )
+ )
if len(variants) == 0:
notepad.add_warning(f"Patient {pp.id} has no variants to work with")
@@ -385,39 +458,50 @@ def _extract_variant_info(
sv_info = None
try:
- variant_coordinates = self._coord_finder.find_coordinates(genomic_interpretation)
+ variant_coordinates = self._coord_finder.find_coordinates(
+ genomic_interpretation
+ )
except ValueError:
notepad.add_warning(
- f"Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` but had an error retrieving any from patient {sample_id}",
+ "Expected a VCF record, a VRS CNV, or an expression with `hgvs.c`"
+ f"but had an error retrieving any from patient {sample_id}",
"Remove variant from testing",
)
return None
-
+
if variant_coordinates is None:
sv_info = self._map_to_imprecise_sv(genomic_interpretation)
if sv_info is None:
notepad.add_warning(
- f"Could not extract the information for large SV annotation",
+ "Could not extract the information for large SV annotation",
"Remove variant from testing",
)
return None
-
+
return VariantInfo(
variant_coordinates=variant_coordinates,
sv_info=sv_info,
)
def _map_to_imprecise_sv(
- self,
+ self,
genomic_interpretation: GenomicInterpretation,
) -> typing.Optional[ImpreciseSvInfo]:
- if genomic_interpretation.HasField('variant_interpretation'):
+ if genomic_interpretation.HasField("variant_interpretation"):
variant_interpretation = genomic_interpretation.variant_interpretation
- if variant_interpretation.HasField('variation_descriptor'):
+ if variant_interpretation.HasField("variation_descriptor"):
variation_descriptor = variant_interpretation.variation_descriptor
-
- structural_type = variation_descriptor.structural_type if variation_descriptor.HasField('structural_type') else None
- gene_context = variation_descriptor.gene_context if variation_descriptor.HasField('gene_context') else None
+
+ structural_type = (
+ variation_descriptor.structural_type
+ if variation_descriptor.HasField("structural_type")
+ else None
+ )
+ gene_context = (
+ variation_descriptor.gene_context
+ if variation_descriptor.HasField("gene_context")
+ else None
+ )
if structural_type is not None and gene_context is not None:
st = hpotk.TermId.from_curie(curie=structural_type.id)
@@ -428,23 +512,23 @@ def _map_to_imprecise_sv(
gene_id=gene_context.value_id,
gene_symbol=gene_context.symbol,
)
-
+
return None
-
+
def _map_structural_type_to_variant_class(
self,
structural_type: hpotk.TermId,
) -> VariantClass:
# This method is most likely incomplete.
- # Please open a ticket if you receive a `ValueError`
+ # Please open a ticket if you receive a `ValueError`
# for a structural type, that is not mapped at the moment,
# to help us enhance the mapping.
- if structural_type.prefix == 'SO':
+ if structural_type.prefix == "SO":
if structural_type.id in self._so_deletions:
return VariantClass.DEL
elif structural_type.id in self._so_duplications:
return VariantClass.DUP
else:
- raise ValueError(f'Unknown structural type {structural_type}')
+ raise ValueError(f"Unknown structural type {structural_type}")
else:
- raise ValueError(f'Unknown structural type {structural_type}')
+ raise ValueError(f"Unknown structural type {structural_type}")
diff --git a/tests/analysis/conftest.py b/tests/analysis/conftest.py
index 86dd01c43..90b46c8d8 100644
--- a/tests/analysis/conftest.py
+++ b/tests/analysis/conftest.py
@@ -35,8 +35,9 @@ def degenerated_cohort(
return Cohort.from_patients(
members=(
- Patient(
+ Patient.from_raw_parts(
labels=labels_a,
+ sex=Sex.UNKNOWN_SEX,
phenotypes=(
Phenotype(
term_id=hpotk.TermId.from_curie("HP:0000118"),
@@ -69,8 +70,9 @@ def degenerated_cohort(
),
),
),
- Patient(
+ Patient.from_raw_parts(
labels=labels_b,
+ sex=Sex.UNKNOWN_SEX,
phenotypes=(
Phenotype(
term_id=hpotk.TermId.from_curie("HP:0000118"),
diff --git a/tests/analysis/pcats/test_disease.py b/tests/analysis/pcats/test_disease.py
index 48acece3e..9e16b3002 100644
--- a/tests/analysis/pcats/test_disease.py
+++ b/tests/analysis/pcats/test_disease.py
@@ -4,7 +4,7 @@
from gpsea.model import Cohort
from gpsea.analysis.pcats import DiseaseAnalysis
-from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact
+from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest
from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
from gpsea.analysis.predicate.phenotype import DiseasePresencePredicate
@@ -13,7 +13,7 @@ class TestDiseaseAnalysis:
@pytest.fixture(scope='class')
def count_statistic(self) -> CountStatistic:
- return ScipyFisherExact()
+ return FisherExactTest()
@pytest.fixture(scope='class')
def suox_disease(self) -> DiseasePresencePredicate:
diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py
index 516bf9fc5..dbd8d508f 100644
--- a/tests/analysis/pcats/test_hpo_term_analysis.py
+++ b/tests/analysis/pcats/test_hpo_term_analysis.py
@@ -7,25 +7,25 @@
from gpsea.analysis.mtc_filter import PhenotypeMtcFilter, HpoMtcFilter
from gpsea.analysis.pcats import HpoTermAnalysis
-from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact
+from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest
from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
class TestHpoTermAnalysis:
- @pytest.fixture(scope='class')
+ @pytest.fixture(scope="class")
def count_statistic(self) -> CountStatistic:
- return ScipyFisherExact()
+ return FisherExactTest()
- @pytest.fixture(scope='class')
+ @pytest.fixture(scope="class")
def phenotype_mtc_filter(
self,
hpo: hpotk.MinimalOntology,
) -> PhenotypeMtcFilter:
return HpoMtcFilter.default_filter(
hpo=hpo,
- term_frequency_threshold=.2,
+ term_frequency_threshold=0.2,
)
@pytest.fixture
@@ -37,8 +37,8 @@ def analysis(
return HpoTermAnalysis(
count_statistic=count_statistic,
mtc_filter=phenotype_mtc_filter,
- mtc_correction='fdr_bh',
- mtc_alpha=.05,
+ mtc_correction="fdr_bh",
+ mtc_alpha=0.05,
)
def test_compare_genotype_vs_phenotypes(
@@ -46,7 +46,7 @@ def test_compare_genotype_vs_phenotypes(
analysis: HpoTermAnalysis,
suox_cohort: Cohort,
suox_gt_predicate: GenotypePolyPredicate,
- suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]
+ suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
):
results = analysis.compare_genotype_vs_phenotypes(
cohort=suox_cohort.all_patients,
@@ -55,4 +55,46 @@ def test_compare_genotype_vs_phenotypes(
)
assert results is not None
- # TODO: improve testing
+
+ assert results.total_tests == 4
+ assert results.n_usable == (35, 18, 13, 25, 23)
+ assert results.pvals == pytest.approx(
+ [
+ 0.0721291631224236,
+ 1.0,
+ float("nan"),
+ 0.35921595820909313,
+ 0.6668461434917216,
+ ],
+ nan_ok=True,
+ )
+ assert results.corrected_pvals is not None
+ assert results.corrected_pvals == pytest.approx(
+ [
+ 0.2885166524896944,
+ 1.0,
+ float("nan"),
+ 0.7184319164181863,
+ 0.8891281913222954,
+ ],
+ nan_ok=True,
+ )
+
+ def test_compare_genotype_vs_phenotypes_explodes_if_no_phenotypes_are_left_after_mtc_filter(
+ self,
+ analysis: HpoTermAnalysis,
+ degenerated_cohort: Cohort,
+ suox_gt_predicate: GenotypePolyPredicate,
+ suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+ ):
+ with pytest.raises(ValueError) as e:
+ analysis.compare_genotype_vs_phenotypes(
+ cohort=degenerated_cohort,
+ gt_predicate=suox_gt_predicate,
+ pheno_predicates=suox_pheno_predicates,
+ )
+
+ assert (
+ e.value.args[0]
+ == "No phenotypes are left for the analysis after MTC filtering step"
+ )
diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py
index 22b6d5882..7138e93ff 100644
--- a/tests/analysis/predicate/genotype/conftest.py
+++ b/tests/analysis/predicate/genotype/conftest.py
@@ -187,13 +187,12 @@ def patient_w_missense(
sample_labels: SampleLabels,
missense_variant: Variant,
) -> Patient:
- return Patient(
+ return Patient.from_raw_parts(
labels=sample_labels,
+ sex=Sex.UNKNOWN_SEX,
phenotypes=(),
diseases=(),
- variants=(
- missense_variant,
- ),
+ variants=(missense_variant,),
)
@@ -202,11 +201,399 @@ def patient_w_frameshift(
sample_labels: SampleLabels,
frameshift_variant: Variant,
) -> Patient:
- return Patient(
+ return Patient.from_raw_parts(
labels=sample_labels,
+ sex=Sex.UNKNOWN_SEX,
+ phenotypes=(),
+ diseases=(),
+ variants=(frameshift_variant,),
+ )
+
+
+"""
+Genesis family - Autosomal dominant but can also be used as X dominant.
+
+* Adam - father, unaffected
+* Eve - mother, affected
+* Cain - son, affected
+"""
+
+
+@pytest.fixture(scope="package")
+def genesis_mutation(
+ genome_build: GenomeBuild,
+ adam_label: SampleLabels,
+ eve_label: SampleLabels,
+ cain_label: SampleLabels,
+) -> Variant:
+ chr22 = genome_build.contig_by_name("chr22")
+ assert chr22 is not None
+ return Variant(
+ variant_info=VariantInfo(
+ variant_coordinates=VariantCoordinates(
+ region=GenomicRegion(
+ contig=chr22,
+ start=100,
+ end=101,
+ strand=Strand.POSITIVE,
+ ),
+ ref="C",
+ alt="G",
+ change_length=0,
+ )
+ ),
+ tx_annotations=(
+ TranscriptAnnotation(
+ gene_id="a_gene",
+ tx_id="tx:xyz",
+ hgvs_cdna=None,
+ is_preferred=False,
+ variant_effects=(
+ VariantEffect.MISSENSE_VARIANT,
+ VariantEffect.SPLICE_DONOR_VARIANT,
+ ),
+ affected_exons=(4,),
+ protein_id="pt:xyz",
+ hgvsp=None,
+ protein_effect_coordinates=Region(40, 41),
+ ),
+ ),
+ genotypes=Genotypes.from_mapping(
+ {
+ adam_label: Genotype.HOMOZYGOUS_REFERENCE,
+ eve_label: Genotype.HETEROZYGOUS,
+ cain_label: Genotype.HETEROZYGOUS,
+ }
+ ),
+ )
+
+
+@pytest.fixture(scope="package")
+def adam_label() -> SampleLabels:
+ return SampleLabels("Adam")
+
+
+@pytest.fixture(scope="package")
+def adam(
+ adam_label: SampleLabels,
+ genesis_mutation: Variant,
+) -> Patient:
+ return Patient(
+ adam_label,
+ sex=Sex.MALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(genesis_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def eve_label() -> SampleLabels:
+ return SampleLabels("Eve")
+
+
+@pytest.fixture(scope="package")
+def eve(
+ eve_label: SampleLabels,
+ genesis_mutation: Variant,
+) -> Patient:
+ return Patient(
+ eve_label,
+ sex=Sex.FEMALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(genesis_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def cain_label() -> SampleLabels:
+ return SampleLabels("Cain")
+
+
+@pytest.fixture(scope="package")
+def cain(
+ cain_label: SampleLabels,
+ genesis_mutation: Variant,
+) -> Patient:
+ return Patient(
+ cain_label,
+ sex=Sex.MALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(genesis_mutation,),
+ )
+
+
+"""
+White family - Autosomal recessive
+
+* Walt - father, HET
+* Skyler - mother, HET
+* Flynn - son, HOM_ALT
+* Holly - daughter, HOM_REF
+"""
+
+
+@pytest.fixture(scope="package")
+def white_mutation(
+ genome_build: GenomeBuild,
+ walt_label: SampleLabels,
+ skyler_label: SampleLabels,
+ flynn_label: SampleLabels,
+ holly_label: SampleLabels,
+) -> Variant:
+ chr22 = genome_build.contig_by_name("chr22")
+ assert chr22 is not None
+ return Variant(
+ variant_info=VariantInfo(
+ variant_coordinates=VariantCoordinates(
+ region=GenomicRegion(
+ contig=chr22,
+ start=100,
+ end=101,
+ strand=Strand.POSITIVE,
+ ),
+ ref="C",
+ alt="G",
+ change_length=0,
+ )
+ ),
+ tx_annotations=(
+ TranscriptAnnotation(
+ gene_id="a_gene",
+ tx_id="tx:xyz",
+ hgvs_cdna=None,
+ is_preferred=False,
+ variant_effects=(
+ VariantEffect.MISSENSE_VARIANT,
+ VariantEffect.SPLICE_DONOR_VARIANT,
+ ),
+ affected_exons=(4,),
+ protein_id="pt:xyz",
+ hgvsp=None,
+ protein_effect_coordinates=Region(40, 41),
+ ),
+ ),
+ genotypes=Genotypes.from_mapping(
+ {
+ walt_label: Genotype.HETEROZYGOUS,
+ skyler_label: Genotype.HETEROZYGOUS,
+ flynn_label: Genotype.HOMOZYGOUS_ALTERNATE,
+ holly_label: Genotype.HOMOZYGOUS_REFERENCE,
+ }
+ ),
+ )
+
+
+@pytest.fixture(scope="package")
+def walt_label() -> SampleLabels:
+ return SampleLabels("Walt")
+
+
+@pytest.fixture(scope="package")
+def walt(
+ walt_label: SampleLabels,
+ white_mutation: Variant,
+) -> Patient:
+ return Patient(
+ walt_label,
+ sex=Sex.MALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(white_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def skyler_label() -> SampleLabels:
+ return SampleLabels("Skyler")
+
+
+@pytest.fixture(scope="package")
+def skyler(
+ skyler_label: SampleLabels,
+ white_mutation: Variant,
+) -> Patient:
+ return Patient(
+ skyler_label,
+ sex=Sex.FEMALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(white_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def flynn_label() -> SampleLabels:
+ return SampleLabels("Flynn")
+
+
+@pytest.fixture(scope="package")
+def flynn(
+ flynn_label: SampleLabels,
+ white_mutation: Variant,
+) -> Patient:
+ return Patient(
+ flynn_label,
+ sex=Sex.MALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(white_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def holly_label() -> SampleLabels:
+ return SampleLabels("Holly")
+
+
+@pytest.fixture(scope="package")
+def holly(
+ holly_label: SampleLabels,
+ white_mutation: Variant,
+) -> Patient:
+ return Patient(
+ holly_label,
+ sex=Sex.FEMALE,
phenotypes=(),
diseases=(),
- variants=(
- frameshift_variant,
+ variants=(white_mutation,),
+ )
+
+
+"""
+Skywalker family - X-linked recessive
+
+* Anakin - father, homozygous reference (possibly hemizygous reference?)
+* Padme - mother, heterozygous
+* Luke - son, hemizygous
+* Leia - daughter, heterozygous
+"""
+
+
+@pytest.fixture(scope="package")
+def skywalker_mutation(
+ genome_build: GenomeBuild,
+ anakin_label: SampleLabels,
+ padme_label: SampleLabels,
+ luke_label: SampleLabels,
+ leia_label: SampleLabels,
+) -> Variant:
+ chrX = genome_build.contig_by_name("chrX")
+ assert chrX is not None
+ return Variant(
+ variant_info=VariantInfo(
+ variant_coordinates=VariantCoordinates(
+ region=GenomicRegion(
+ contig=chrX,
+ start=100,
+ end=101,
+ strand=Strand.POSITIVE,
+ ),
+ ref="C",
+ alt="G",
+ change_length=0,
+ )
),
+ tx_annotations=(
+ TranscriptAnnotation(
+ gene_id="a_gene",
+ tx_id="tx:xyz",
+ hgvs_cdna=None,
+ is_preferred=False,
+ variant_effects=(
+ VariantEffect.MISSENSE_VARIANT,
+ VariantEffect.SPLICE_DONOR_VARIANT,
+ ),
+ affected_exons=(4,),
+ protein_id="pt:xyz",
+ hgvsp=None,
+ protein_effect_coordinates=Region(40, 41),
+ ),
+ ),
+ genotypes=Genotypes.from_mapping(
+ {
+ anakin_label: Genotype.HOMOZYGOUS_REFERENCE,
+ padme_label: Genotype.HETEROZYGOUS,
+ luke_label: Genotype.HEMIZYGOUS,
+ leia_label: Genotype.HETEROZYGOUS,
+ }
+ ),
+ )
+
+
+@pytest.fixture(scope="package")
+def anakin_label() -> SampleLabels:
+ return SampleLabels("Anakin")
+
+
+@pytest.fixture(scope="package")
+def anakin(
+ anakin_label: SampleLabels,
+ skywalker_mutation: Variant,
+) -> Patient:
+ return Patient(
+ anakin_label,
+ sex=Sex.MALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(skywalker_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def padme_label() -> SampleLabels:
+ return SampleLabels("Padme")
+
+
+@pytest.fixture(scope="package")
+def padme(
+ padme_label: SampleLabels,
+ skywalker_mutation: Variant,
+) -> Patient:
+ return Patient(
+ padme_label,
+ sex=Sex.FEMALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(skywalker_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def luke_label() -> SampleLabels:
+ return SampleLabels("Luke")
+
+
+@pytest.fixture(scope="package")
+def luke(
+ luke_label: SampleLabels,
+ skywalker_mutation: Variant,
+) -> Patient:
+ return Patient(
+ luke_label,
+ sex=Sex.MALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(skywalker_mutation,),
+ )
+
+
+@pytest.fixture(scope="package")
+def leia_label() -> SampleLabels:
+ return SampleLabels("Leia")
+
+
+@pytest.fixture(scope="package")
+def leia(
+ leia_label: SampleLabels,
+ skywalker_mutation: Variant,
+) -> Patient:
+ return Patient(
+ leia_label,
+ sex=Sex.FEMALE,
+ phenotypes=(),
+ diseases=(),
+ variants=(skywalker_mutation,),
)
diff --git a/tests/analysis/predicate/genotype/test_allele_counter.py b/tests/analysis/predicate/genotype/test_allele_counter.py
index 0b4355012..506ec3023 100644
--- a/tests/analysis/predicate/genotype/test_allele_counter.py
+++ b/tests/analysis/predicate/genotype/test_allele_counter.py
@@ -175,8 +175,9 @@ def patient(
hom_alt_lmna: Variant,
hemi_dmd: Variant,
) -> Patient:
- return Patient(
+ return Patient.from_raw_parts(
labels=sample_labels,
+ sex=Sex.UNKNOWN_SEX,
variants=(
het_lmna,
hom_alt_lmna,
diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py
index 3e71283b4..67bc2d117 100644
--- a/tests/analysis/predicate/genotype/test_gt_predicates.py
+++ b/tests/analysis/predicate/genotype/test_gt_predicates.py
@@ -1,26 +1,30 @@
+import typing
+
import pytest
from gpsea.model import *
from gpsea.analysis.predicate.genotype import (
GenotypePolyPredicate,
groups_predicate,
+ filtering_predicate,
VariantPredicates,
+ VariantPredicate,
+ ModeOfInheritancePredicate,
)
-class TestGroupsPredicate:
+TX_ID = "tx:xyz"
- TX_ID = "tx:xyz"
+
+class TestGroupsPredicate:
@pytest.fixture(scope="class")
def predicate(self) -> GenotypePolyPredicate:
return groups_predicate(
predicates=(
+ VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID),
VariantPredicates.variant_effect(
- VariantEffect.MISSENSE_VARIANT, TestGroupsPredicate.TX_ID
- ),
- VariantPredicates.variant_effect(
- VariantEffect.FRAMESHIFT_VARIANT, TestGroupsPredicate.TX_ID
+ VariantEffect.FRAMESHIFT_VARIANT, TX_ID
),
),
group_names=(
@@ -33,8 +37,8 @@ def test_get_question(
self,
predicate: GenotypePolyPredicate,
):
- question = predicate.get_question()
- assert question == "Genotype group: Point, LoF"
+ question = predicate.get_question_base()
+ assert question == "Genotype group"
def test_get_categorizations(
self,
@@ -53,7 +57,7 @@ def test_get_categorizations(
def test_test__missense(
self,
- patient_w_missense: Variant,
+ patient_w_missense: Patient,
predicate: GenotypePolyPredicate,
):
cat = predicate.test(patient_w_missense)
@@ -65,7 +69,7 @@ def test_test__missense(
def test_test__frameshift(
self,
- patient_w_frameshift: Variant,
+ patient_w_frameshift: Patient,
predicate: GenotypePolyPredicate,
):
cat = predicate.test(patient_w_frameshift)
@@ -74,3 +78,188 @@ def test_test__frameshift(
assert cat.category.cat_id == 1
assert cat.category.name == "LoF"
assert cat.category.description == "FRAMESHIFT_VARIANT on tx:xyz"
+
+
+class TestModeOfInheritancePredicate:
+
+ @pytest.fixture(scope="class")
+ def variant_predicate(self) -> VariantPredicate:
+ return VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID)
+
+ @pytest.mark.parametrize(
+ "patient_name,name",
+ [
+ ("adam", "HOM_REF"),
+ ("eve", "HET"),
+ ("cain", "HET"),
+ ],
+ )
+ def test_autosomal_dominant(
+ self,
+ patient_name: str,
+ name: str,
+ variant_predicate: VariantPredicate,
+ request: pytest.FixtureRequest,
+ ):
+ patient = request.getfixturevalue(patient_name)
+ predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate)
+
+ categorization = predicate.test(patient)
+
+ assert categorization is not None
+
+ assert categorization.category.name == name
+
+ @pytest.mark.parametrize(
+ "patient_name,name",
+ [
+ ("walt", "HET"),
+ ("skyler", "HET"),
+ ("flynn", "BIALLELIC_ALT"),
+ ("holly", "HOM_REF"),
+ ],
+ )
+ def test_autosomal_recessive(
+ self,
+ patient_name: str,
+ name: str,
+ variant_predicate: VariantPredicate,
+ request: pytest.FixtureRequest,
+ ):
+ patient = request.getfixturevalue(patient_name)
+ predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate)
+
+ categorization = predicate.test(patient)
+
+ assert categorization is not None
+
+ assert categorization.category.name == name
+
+ @pytest.mark.parametrize(
+ "patient_name,name",
+ [
+ ("adam", "HOM_REF"),
+ ("eve", "HET"),
+ ("cain", "HET"),
+ ],
+ )
+ def test_x_dominant(
+ self,
+ patient_name: str,
+ name: str,
+ variant_predicate: VariantPredicate,
+ request: pytest.FixtureRequest,
+ ):
+ patient = request.getfixturevalue(patient_name)
+ predicate = ModeOfInheritancePredicate.x_dominant(variant_predicate)
+
+ categorization = predicate.test(patient)
+
+ assert categorization is not None
+
+ assert categorization.category.name == name
+
+ @pytest.mark.parametrize(
+ "patient_name,name",
+ [
+ ("anakin", "HOM_REF"),
+ ("padme", "HET"),
+ ("luke", "HEMI"),
+ ("leia", "HET"),
+ ],
+ )
+ def test_x_recessive(
+ self,
+ patient_name: str,
+ name: str,
+ variant_predicate: VariantPredicate,
+ request: pytest.FixtureRequest,
+ ):
+ patient = request.getfixturevalue(patient_name)
+ predicate = ModeOfInheritancePredicate.x_recessive(variant_predicate)
+
+ categorization = predicate.test(patient)
+
+ assert categorization is not None
+
+ assert categorization.category.name == name
+
+
+class TestPolyPredicate:
+
+ @pytest.fixture(scope="class")
+ def x_recessive_gt_predicate(self) -> GenotypePolyPredicate:
+ affects_suox = VariantPredicates.gene("SUOX")
+ return ModeOfInheritancePredicate.x_recessive(
+ variant_predicate=affects_suox,
+ )
+
+ @pytest.mark.parametrize(
+ "indices, expected",
+ [
+ ((0, 1), 2),
+ ((1, 0), 2),
+ ((1, 2), 2),
+ ],
+ )
+ def test_filtering_predicate(
+ self,
+ indices: typing.Sequence[int],
+ expected: int,
+ x_recessive_gt_predicate: GenotypePolyPredicate,
+ ):
+ cats = x_recessive_gt_predicate.get_categorizations()
+ targets = [cats[i] for i in indices]
+ predicate = filtering_predicate(
+ predicate=x_recessive_gt_predicate,
+ targets=targets,
+ )
+
+ actual = len(predicate.get_categorizations())
+
+ assert actual == expected
+
+ def test_filtering_predicate__explodes_when_not_subsetting(
+ self,
+ x_recessive_gt_predicate: GenotypePolyPredicate,
+ ):
+ with pytest.raises(ValueError) as ve:
+ filtering_predicate(
+ predicate=x_recessive_gt_predicate,
+ targets=x_recessive_gt_predicate.get_categorizations(),
+ )
+
+ assert (
+ ve.value.args[0]
+ == "It makes no sense to subset the a predicate with 4 categorizations with the same number (4) of targets"
+ )
+
+ def test_filtering_predicate__explodes_when_using_random_junk(
+ self,
+ x_recessive_gt_predicate: GenotypePolyPredicate,
+ ):
+ with pytest.raises(ValueError) as ve:
+ filtering_predicate(
+ predicate=x_recessive_gt_predicate,
+ targets=(0, 1),
+ )
+
+ assert (
+ ve.value.args[0]
+ == "The targets at following indices are not categorizations: [0, 1]"
+ )
+
+ def test_filtering_predicate__explodes_when_using_one_category(
+ self,
+ x_recessive_gt_predicate: GenotypePolyPredicate,
+ ):
+ with pytest.raises(ValueError) as ve:
+ filtering_predicate(
+ predicate=x_recessive_gt_predicate,
+ targets=(x_recessive_gt_predicate.get_categorizations()[0],),
+ )
+
+ assert (
+ ve.value.args[0]
+ == "At least 2 target categorizations must be provided but got 1"
+ )
diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py
index 2f78d7185..a076adacf 100644
--- a/tests/analysis/predicate/genotype/test_predicates.py
+++ b/tests/analysis/predicate/genotype/test_predicates.py
@@ -202,6 +202,7 @@ def test_protein_feature_id(
assert predicate.test(missense_variant) == expected
+
class TestLogicalVariantPredicate:
"""
Test that the AND and OR variant predicate combinators work as expected.
diff --git a/tests/analysis/test_de_vries_scorer.py b/tests/analysis/pscore/test_de_vries_scorer.py
similarity index 97%
rename from tests/analysis/test_de_vries_scorer.py
rename to tests/analysis/pscore/test_de_vries_scorer.py
index 220dd18b0..89679a37f 100644
--- a/tests/analysis/test_de_vries_scorer.py
+++ b/tests/analysis/pscore/test_de_vries_scorer.py
@@ -4,7 +4,7 @@
import pytest
from gpsea.analysis.pscore import DeVriesPhenotypeScorer
-from gpsea.model import Patient, SampleLabels, Phenotype
+from gpsea.model import Patient, SampleLabels, Phenotype, Sex
intrauterine_growth_retardation = 'HP:0001511'
small_for_gestational_age = 'HP:0001518'
@@ -69,6 +69,7 @@ def test_a_patient(
):
patient = Patient(
labels=SampleLabels("test"),
+ sex=Sex.UNKNOWN_SEX,
phenotypes=(
Phenotype.from_raw_parts(
term_id=curie,
diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py
deleted file mode 100644
index 5db246218..000000000
--- a/tests/analysis/test_analysis.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import pathlib
-import typing
-
-import hpotk
-import numpy as np
-import pandas as pd
-import pytest
-
-from gpsea.analysis import (
- apply_predicates_on_patients,
- CohortAnalysis, CohortAnalysisConfiguration,
- configure_cohort_analysis,
-)
-
-from gpsea.model import *
-from gpsea.model.genome import *
-from gpsea.analysis.predicate import PatientCategories
-from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicate
-from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
-
-
-def test_apply_predicates_on_patients(
- suox_cohort: Cohort,
- suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
- suox_gt_predicate: GenotypePolyPredicate,
-):
- categories, n_usable, counts = apply_predicates_on_patients(
- patients=suox_cohort.all_patients,
- pheno_predicates=suox_pheno_predicates,
- gt_predicate=suox_gt_predicate,
- )
-
- assert isinstance(categories, typing.Collection)
- assert PatientCategories.YES in categories
- assert PatientCategories.NO in categories
- assert len(categories) == 2
-
- assert isinstance(n_usable, pd.Series)
- assert len(n_usable) == 5
-
- assert isinstance(counts, typing.Mapping)
- assert len(counts) == 5
-
- seizure_counts = counts[hpotk.TermId.from_curie('HP:0001250')]
- assert np.array_equal(seizure_counts.to_numpy(), np.array([[17, 11], [7, 0]]))
-
-
-class TestCohortAnalysis:
-
- @pytest.fixture
- def cohort_analysis(
- self,
- suox_cohort: Cohort,
- hpo: hpotk.MinimalOntology,
- tmp_path: pathlib.Path,
- ) -> CohortAnalysis:
- return configure_cohort_analysis(
- cohort=suox_cohort,
- hpo=hpo,
- cache_dir=str(tmp_path),
- )
-
- def test_analysis_passes_if_variant_predicate_always_returns_false(
- self,
- cohort_analysis: CohortAnalysis,
- always_false_variant_predicate: VariantPredicate,
- ):
- results = cohort_analysis.compare_hpo_vs_genotype(
- predicate=always_false_variant_predicate
- )
- assert results is not None
-
- def test_analysis_explodes_if_no_phenotypes_are_left_for_analysis(
- self,
- hpo: hpotk.MinimalOntology,
- degenerated_cohort: Cohort,
- tmp_path: pathlib.Path,
- always_false_variant_predicate: VariantPredicate,
- ):
- config = CohortAnalysisConfiguration()
- config.hpo_mtc_strategy()
- cohort_analysis = configure_cohort_analysis(
- hpo=hpo,
- cohort=degenerated_cohort,
- cache_dir=str(tmp_path),
- config=config,
- )
-
- with pytest.raises(ValueError) as e:
- _ = cohort_analysis.compare_hpo_vs_genotype(
- predicate=always_false_variant_predicate,
- )
-
- assert e.value.args[0] == 'No phenotypes are left for the analysis after MTC filtering step'
diff --git a/tests/analysis/test_examples.py b/tests/analysis/test_examples.py
index 536f9907c..8f17161d6 100644
--- a/tests/analysis/test_examples.py
+++ b/tests/analysis/test_examples.py
@@ -11,6 +11,8 @@
from gpsea.model import Cohort, VariantEffect
+# TODO: remove at some point!
+@pytest.mark.skip('Obsolete tests')
class TestCohortAnalysis:
def test_compare_by_variant_effect(
diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py
index c07600ef2..d904cfe43 100644
--- a/tests/analysis/test_mtc_filter.py
+++ b/tests/analysis/test_mtc_filter.py
@@ -1,3 +1,4 @@
+from itertools import count
import typing
import hpotk
@@ -5,11 +6,10 @@
import pandas as pd
import pytest
-from gpsea.analysis import apply_predicates_on_patients
from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter
-from gpsea.analysis.predicate import PatientCategories
from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
-from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate
+from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate
+from gpsea.analysis.pcats import apply_predicates_on_patients
from gpsea.model import Cohort
@@ -31,32 +31,45 @@ def patient_counts(
suox_cohort: Cohort,
suox_gt_predicate: GenotypePolyPredicate,
suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
- ) -> typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- ]:
- categories, n_usable, all_counts = apply_predicates_on_patients(
+ ) -> typing.Sequence[pd.DataFrame]:
+ _, counts = apply_predicates_on_patients(
patients=suox_cohort.all_patients,
gt_predicate=suox_gt_predicate,
pheno_predicates=suox_pheno_predicates,
)
- return n_usable, all_counts
+ return counts
@pytest.fixture(scope='class')
- def gt_categories(self) -> pd.Index:
- return pd.Index([PatientCategories.YES, PatientCategories.NO])
+ def gt_predicate(
+ self,
+ suox_gt_predicate: GenotypePolyPredicate,
+ ) -> GenotypePolyPredicate:
+ return suox_gt_predicate
@pytest.fixture(scope='class')
- def pheno_categories(self) -> pd.Index:
- return pd.Index([PatientCategories.YES, PatientCategories.NO])
+ def ph_predicate(
+ self,
+ hpo: hpotk.MinimalOntology,
+ ) -> PhenotypePolyPredicate[hpotk.TermId]:
+ """
+ For the purpose of testing counts, let's pretend the counts
+ were created by this predicate.
+ """
+ return PropagatingPhenotypePredicate(
+ hpo=hpo,
+ query=hpotk.TermId.from_curie("HP:0001250"), # Seizure
+ missing_implies_phenotype_excluded=False,
+ )
@staticmethod
def prepare_counts_df(
counts,
- index: pd.Index,
- columns: pd.Index,
+ gt_predicate: GenotypePolyPredicate,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
):
values = np.array(counts).reshape((2, 2))
+ index = pd.Index(gt_predicate.get_categories())
+ columns = pd.Index(ph_predicate.get_categories())
return pd.DataFrame(data=values, index=index, columns=columns)
@pytest.mark.parametrize(
@@ -69,17 +82,17 @@ def prepare_counts_df(
]
)
def test_one_genotype_has_zero_hpo_observations(
- self,
- counts: typing.Tuple[int],
- expected: bool,
- gt_categories: pd.Index,
- pheno_categories: pd.Index,
+ self,
+ counts: typing.Tuple[int],
+ expected: bool,
+ gt_predicate: GenotypePolyPredicate,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
):
- counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories)
+ counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
actual = HpoMtcFilter.one_genotype_has_zero_hpo_observations(
counts=counts_df,
- gt_categories=gt_categories,
+ gt_predicate=gt_predicate,
)
assert actual == expected
@@ -101,12 +114,15 @@ def test_some_cell_has_greater_than_one_count(
self,
counts: typing.Tuple[int],
expected: bool,
- gt_categories: pd.Index,
- pheno_categories: pd.Index,
+ gt_predicate: GenotypePolyPredicate,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
):
- counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories)
+ counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
- actual = HpoMtcFilter.some_cell_has_greater_than_one_count(counts=counts_df)
+ actual = HpoMtcFilter.some_cell_has_greater_than_one_count(
+ counts=counts_df,
+ ph_predicate=ph_predicate,
+ )
assert actual == expected
@@ -114,95 +130,127 @@ def test_some_cell_has_greater_than_one_count(
"counts, expected",
[
((10, 100, 50, 500), True),
+ ((0, 0, 100, 100), True),
+ ((10, 100, 50, 500), True),
+ ((10, 100, 10, 105), False),
((95, 60, 144 - 95, 71 - 60), False),
((40, 15, 18, 15), False),
]
)
def test_genotypes_have_same_hpo_proportions(
- self,
- counts: typing.Tuple[int],
- expected: bool,
- gt_categories: pd.Index,
- pheno_categories: pd.Index,
+ self,
+ counts: typing.Tuple[int],
+ expected: bool,
+ gt_predicate: GenotypePolyPredicate,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
):
- counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_categories, pheno_categories)
+ counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
actual = HpoMtcFilter.genotypes_have_same_hpo_proportions(
counts=counts_df,
- gt_categories=gt_categories,
+ gt_predicate=gt_predicate,
+ ph_predicate=ph_predicate,
)
assert actual == expected
- def test_filter_terms_to_test(
- self,
- mtc_filter: HpoMtcFilter,
- suox_gt_predicate: GenotypePolyPredicate,
- patient_counts: typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- ],
+ @pytest.mark.parametrize(
+ "counts, expected",
+ [
+ ((1, 2, 99, 198), .01),
+ ((1, 3, 99, 197), .015),
+ ((0, 0, 100, 200), 0.),
+ ((0, 0, 0, 0), 0.),
+ ]
+ )
+ def test_get_maximum_group_observed_HPO_frequency(
+ self,
+ counts: typing.Tuple[int],
+ expected: float,
+ gt_predicate: GenotypePolyPredicate,
+ ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
):
- n_usable, all_counts = patient_counts
- mtc_report = mtc_filter.filter_terms_to_test(
- suox_gt_predicate,
- n_usable,
- all_counts,
+ counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
+
+ actual = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ counts_frame=counts_df,
+ ph_predicate=ph_predicate,
)
- assert isinstance(mtc_report, tuple)
- assert len(mtc_report) == 3
+ assert actual == pytest.approx(expected)
+
+ def test_filter_terms_to_test(
+ self,
+ mtc_filter: HpoMtcFilter,
+ suox_gt_predicate: GenotypePolyPredicate,
+ suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+ patient_counts: typing.Sequence[pd.DataFrame],
+ ):
+ mtc_report = mtc_filter.filter(
+ gt_predicate=suox_gt_predicate,
+ ph_predicates=suox_pheno_predicates,
+ counts=patient_counts,
+ )
- filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report
+ assert isinstance(mtc_report, typing.Sequence)
+ assert len(mtc_report) == 5
- assert reason_for_filtering_out['Skipping term because all genotypes have same HPO observed proportions'] == 1
- assert reason_for_filtering_out['Skipping general term'] == 16
- assert reason_for_filtering_out['Skipping non-target term'] == 5
- assert reason_for_filtering_out['Skipping top level term'] == 0
+ is_ok = [r.is_passed() for r in mtc_report]
+ assert is_ok == [True, True, False, True, True]
- assert len(filtered_n_usable) == 4
- assert len(filtered_all_counts) == 4
+ reasons = [r.reason for r in mtc_report]
+ assert reasons == [
+ None, None,
+ 'Skipping term because all genotypes have same HPO observed proportions',
+ None, None,
+ ]
def test_specified_term_mtc_filter(
- self,
- suox_gt_predicate: GenotypePolyPredicate,
- patient_counts: typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- ],
+ self,
+ suox_gt_predicate: GenotypePolyPredicate,
+ suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+ patient_counts: typing.Sequence[pd.DataFrame],
):
"""
The point of this test is to check that if we filter to test only one term ("HP:0032350"), then this
is the only term that should survive the filter. We start with a total of five terms (n_usable==5),
- but after our filter, only one survives (filtered_n_usable == 1), and we have four cases in which the
- reason for filtering out is 'Skipping non-specified term'
+ but after our filter, only one survives, and we have four cases in which the
+ reason for filtering out is 'Non-specified term'
"""
- specified_filter = SpecifiedTermsMtcFilter(terms_to_test={hpotk.TermId.from_curie("HP:0032350")})
- n_usable, all_counts = patient_counts
- mtc_report = specified_filter.filter_terms_to_test(
- suox_gt_predicate,
- n_usable,
- all_counts,
+ specified_filter = SpecifiedTermsMtcFilter(
+ terms_to_test=(hpotk.TermId.from_curie("HP:0032350"),),
+ )
+
+ mtc_report = specified_filter.filter(
+ gt_predicate=suox_gt_predicate,
+ ph_predicates=suox_pheno_predicates,
+ counts=patient_counts,
)
- assert isinstance(mtc_report, tuple)
- assert len(mtc_report) == 3 # # Skipping non-specified term (n=5)
+ assert isinstance(mtc_report, typing.Sequence)
+ assert len(mtc_report) == 5
+
+ is_passed = [r.is_passed() for r in mtc_report]
+ assert is_passed == [
+ False, False, True, False, False,
+ ]
- filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report
- assert len(n_usable) == 5
- assert len(filtered_n_usable) == 1
- assert reason_for_filtering_out['Skipping non-specified term'] == 4
+ reasons = [r.reason for r in mtc_report]
+ assert reasons == [
+ 'Non-specified term',
+ 'Non-specified term',
+ None,
+ 'Non-specified term',
+ 'Non-specified term',
+ ]
def test_min_observed_HPO_threshold(
self,
- patient_counts: typing.Tuple[
- typing.Mapping[hpotk.TermId, int],
- typing.Mapping[hpotk.TermId, pd.DataFrame],
- ],
+ suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+ patient_counts: typing.Sequence[pd.DataFrame],
):
"""
In our heuristic filter, we only test terms that have at least a threshold
- frequency in at least one of the groups. We use the "all counts" datastructure, that
- is a dictionary whose keys are hpotoolkit TermIds and whose values are pandas DataFrames
+ frequency in at least one of the groups. We use the `patient_counts` - a sequence of DataFrames
with 2x2 contingenicy tables of counts. For instance, each column will have one row for
PatientCategories.YES and one for PatientCategories.NO, indicating counts of measured observed/excluded
HPO phenotypes. Each column is a certain genotype, e.g., MISSENSE or NON-MISSENSE. We want the
@@ -211,28 +259,57 @@ def test_min_observed_HPO_threshold(
for all of the HPO terms in the dictionary.
"""
EPSILON = 0.001
- _, all_counts = patient_counts
+ curie2idx = {
+ p.phenotype.value: i
+ for i, p
+ in enumerate(suox_pheno_predicates)
+ }
# Ectopia lentis HP:0001083 (6 9 1 2), freqs are 6/15=0.4 and 1/3=0.33
- ectopia = all_counts[hpotk.TermId.from_curie("HP:0001083")]
- max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ectopia)
+ idx = curie2idx["HP:0001083"]
+ ectopia = patient_counts[idx]
+ ectopia_predicate = suox_pheno_predicates[idx]
+ max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ ectopia,
+ ph_predicate=ectopia_predicate,
+ )
assert max_f == pytest.approx(0.4, abs=EPSILON)
# Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1
- seizure = all_counts[hpotk.TermId.from_curie("HP:0001250")]
- max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(seizure)
+ idx = curie2idx["HP:0001250"]
+ seizure = patient_counts[idx]
+ seizure_predicate = suox_pheno_predicates[idx]
+ max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ seizure,
+ ph_predicate=seizure_predicate
+ )
assert max_f == pytest.approx(1.0, abs=EPSILON)
# Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1
- sulfocysteinuria = all_counts[hpotk.TermId.from_curie("HP:0032350")]
- max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(sulfocysteinuria)
+ idx = curie2idx["HP:0032350"]
+ sulfocysteinuria = patient_counts[idx]
+ sulfocysteinuria_predicate = suox_pheno_predicates[idx]
+ max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ sulfocysteinuria,
+ ph_predicate=sulfocysteinuria_predicate,
+ )
assert max_f == pytest.approx(1.0, abs=EPSILON)
# Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5
- ndelay = all_counts[hpotk.TermId.from_curie("HP:0012758")]
- max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(ndelay)
+ idx = curie2idx["HP:0012758"]
+ ndelay = patient_counts[idx]
+ ndelay_predicate = suox_pheno_predicates[idx]
+ max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ ndelay,
+ ph_predicate=ndelay_predicate,
+ )
assert max_f == pytest.approx(0.5, abs=EPSILON)
# Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714
- hypertonia = all_counts[hpotk.TermId.from_curie("HP:0001276")]
- max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(hypertonia)
+ idx = curie2idx["HP:0001276"]
+ hypertonia = patient_counts[idx]
+ hypertonia_predicate = suox_pheno_predicates[idx]
+ max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
+ hypertonia,
+ ph_predicate=hypertonia_predicate,
+ )
assert max_f == pytest.approx(0.5714, abs=EPSILON)
diff --git a/tests/analysis/test_pscore.py b/tests/analysis/test_pscore.py
index 9b6329029..a4c1c24be 100644
--- a/tests/analysis/test_pscore.py
+++ b/tests/analysis/test_pscore.py
@@ -3,7 +3,7 @@
import hpotk
import pytest
-from gpsea.model import Patient, Phenotype, SampleLabels
+from gpsea.model import Patient, Phenotype, SampleLabels, Sex
from gpsea.analysis.pscore import CountingPhenotypeScorer
@@ -50,8 +50,9 @@ def test_a_patient(
expected: int,
counting_scorer: CountingPhenotypeScorer,
):
- patient = Patient(
+ patient = Patient.from_raw_parts(
labels=SampleLabels("test"),
+ sex=Sex.UNKNOWN_SEX,
phenotypes=(
Phenotype(
hpotk.TermId.from_curie(curie),
diff --git a/tests/conftest.py b/tests/conftest.py
index 54f012ecc..b66b55e7d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -268,42 +268,53 @@ def toy_cohort(
genotypes=Genotypes.from_mapping({SampleLabels('LargeCNV'): Genotype.HETEROZYGOUS}))
patients = (
- Patient(SampleLabels('HetSingleVar'),
- phenotypes=(
- test_phenotypes['arachnodactyly_T'],
- test_phenotypes['spasticity_F'],
- test_phenotypes['focal_clonic_seizure_T']),
- variants=(dup,),
- diseases=(test_diseases['KBG_T'],)
- ),
- Patient(SampleLabels('HetDoubleVar1'),
- phenotypes=(
- test_phenotypes['arachnodactyly_T'], test_phenotypes['seizure_T'], test_phenotypes['spasticity_T'],
- ),
- variants=(indel, snv_stop_gain),
- diseases=(test_diseases['KBG_T'],)
- ),
- Patient(SampleLabels('HetDoubleVar2'),
- phenotypes=(
- test_phenotypes['arachnodactyly_F'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
- ),
- variants=(snv_missense, del_frameshift),
- diseases=(test_diseases['KBG_T'],)
- ),
- Patient(SampleLabels('HomoVar'),
- phenotypes=(
- test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
- ),
- variants=(del_small,),
- diseases=()
- ),
- Patient(SampleLabels('LargeCNV'),
- phenotypes=(
- test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_F'],
- ),
- variants=(del_large,),
- diseases=()
- ),
+ Patient.from_raw_parts(
+ SampleLabels('HetSingleVar'),
+ sex=Sex.UNKNOWN_SEX,
+ phenotypes=(
+ test_phenotypes['arachnodactyly_T'],
+ test_phenotypes['spasticity_F'],
+ test_phenotypes['focal_clonic_seizure_T']
+ ),
+ variants=(dup,),
+ diseases=(test_diseases['KBG_T'],)
+ ),
+ Patient.from_raw_parts(
+ SampleLabels('HetDoubleVar1'),
+ sex=Sex.UNKNOWN_SEX,
+ phenotypes=(
+ test_phenotypes['arachnodactyly_T'], test_phenotypes['seizure_T'], test_phenotypes['spasticity_T'],
+ ),
+ variants=(indel, snv_stop_gain),
+ diseases=(test_diseases['KBG_T'],)
+ ),
+ Patient.from_raw_parts(
+ SampleLabels('HetDoubleVar2'),
+ sex=Sex.UNKNOWN_SEX,
+ phenotypes=(
+ test_phenotypes['arachnodactyly_F'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
+ ),
+ variants=(snv_missense, del_frameshift),
+ diseases=(test_diseases['KBG_T'],)
+ ),
+ Patient.from_raw_parts(
+ SampleLabels('HomoVar'),
+ sex=Sex.UNKNOWN_SEX,
+ phenotypes=(
+ test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_T'],
+ ),
+ variants=(del_small,),
+ diseases=()
+ ),
+ Patient.from_raw_parts(
+ SampleLabels('LargeCNV'),
+ sex=Sex.UNKNOWN_SEX,
+ phenotypes=(
+ test_phenotypes['arachnodactyly_T'], test_phenotypes['spasticity_T'], test_phenotypes['seizure_F'],
+ ),
+ variants=(del_large,),
+ diseases=()
+ ),
)
return Cohort.from_patients(patients)
diff --git a/tests/test_data/SUOX.json b/tests/test_data/SUOX.json
index d85873adc..083f77dd6 100644
--- a/tests/test_data/SUOX.json
+++ b/tests/test_data/SUOX.json
@@ -5,6 +5,7 @@
"label": "individual_10_PMID_12112661",
"meta_label": "PMID_36303223_individual_10_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -141,6 +142,7 @@
"label": "individual_11_PMID_12112661",
"meta_label": "PMID_36303223_individual_11_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -392,6 +394,7 @@
"label": "individual_12_PMID_12112661",
"meta_label": "PMID_36303223_individual_12_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -528,6 +531,7 @@
"label": "individual_13_PMID_12112661",
"meta_label": "PMID_36303223_individual_13_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -664,6 +668,7 @@
"label": "individual_14_PMID_11825068",
"meta_label": "PMID_36303223_individual_14_PMID_11825068"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0034332",
@@ -840,6 +845,7 @@
"label": "individual_15_PMID_12001203",
"meta_label": "PMID_36303223_individual_15_PMID_12001203"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -1000,6 +1006,7 @@
"label": "individual_16_PMID_12368985",
"meta_label": "PMID_36303223_individual_16_PMID_12368985"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -1283,6 +1290,7 @@
"label": "individual_17_PMID_15952210",
"meta_label": "PMID_36303223_individual_17_PMID_15952210"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -1459,6 +1467,7 @@
"label": "individual_18_PMID_23414711",
"meta_label": "PMID_36303223_individual_18_PMID_23414711"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -1639,6 +1648,7 @@
"label": "individual_19_PMID_23452914",
"meta_label": "PMID_36303223_individual_19_PMID_23452914"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0002071",
@@ -1827,6 +1837,7 @@
"label": "individual_1_PMID_9050047",
"meta_label": "PMID_36303223_individual_1_PMID_9050047"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -1999,6 +2010,7 @@
"label": "individual_20_PMID_24938149",
"meta_label": "PMID_36303223_individual_20_PMID_24938149"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -2270,6 +2282,7 @@
"label": "individual_21_PMID_27289259",
"meta_label": "PMID_36303223_individual_21_PMID_27289259"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -2442,6 +2455,7 @@
"label": "individual_22_PMID_27289259",
"meta_label": "PMID_36303223_individual_22_PMID_27289259"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -2614,6 +2628,7 @@
"label": "individual_23_PMID_28725568",
"meta_label": "PMID_36303223_individual_23_PMID_28725568"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -2897,6 +2912,7 @@
"label": "individual_24_PMID_28629418",
"meta_label": "PMID_36303223_individual_24_PMID_28629418"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -3065,6 +3081,7 @@
"label": "individual_25_PMID_31870341",
"meta_label": "PMID_36303223_individual_25_PMID_31870341"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0034332",
@@ -3348,6 +3365,7 @@
"label": "individual_26_PMID_31870341",
"meta_label": "PMID_36303223_individual_26_PMID_31870341"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0034332",
@@ -3631,6 +3649,7 @@
"label": "individual_27_PMID_31870341",
"meta_label": "PMID_36303223_individual_27_PMID_31870341"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0034332",
@@ -3914,6 +3933,7 @@
"label": "individual_28_PMID_33335014",
"meta_label": "PMID_36303223_individual_28_PMID_33335014"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -4086,6 +4106,7 @@
"label": "individual_29_PMID_33335014",
"meta_label": "PMID_36303223_individual_29_PMID_33335014"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -4262,6 +4283,7 @@
"label": "individual_2_PMID_9600976",
"meta_label": "PMID_36303223_individual_2_PMID_9600976"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0012758",
@@ -4434,6 +4456,7 @@
"label": "individual_30_PMID_31806255",
"meta_label": "PMID_36303223_individual_30_PMID_31806255"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0034332",
@@ -4622,6 +4645,7 @@
"label": "individual_31_PMID_31806255",
"meta_label": "PMID_36303223_individual_31_PMID_31806255"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0034332",
@@ -4810,6 +4834,7 @@
"label": "individual_32_PMID_34025712",
"meta_label": "PMID_36303223_individual_32_PMID_34025712"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -5097,6 +5122,7 @@
"label": "individual_33_PMID_33405344",
"meta_label": "PMID_36303223_individual_33_PMID_33405344"
},
+ "sex": "FEMALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -5368,6 +5394,7 @@
"label": "individual_34_PMID_33405344",
"meta_label": "PMID_36303223_individual_34_PMID_33405344"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -5639,6 +5666,7 @@
"label": "individual_35_PMID_36303223",
"meta_label": "PMID_36303223_individual_35_PMID_36303223"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0001083",
@@ -5938,6 +5966,7 @@
"label": "individual_3_PMID_10519592",
"meta_label": "PMID_36303223_individual_3_PMID_10519592"
},
+ "sex": "MALE",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -6229,6 +6258,7 @@
"label": "individual_4_PMID_12112661",
"meta_label": "PMID_36303223_individual_4_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -6365,6 +6395,7 @@
"label": "individual_5_PMID_12112661",
"meta_label": "PMID_36303223_individual_5_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -6616,6 +6647,7 @@
"label": "individual_6_PMID_12112661",
"meta_label": "PMID_36303223_individual_6_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -6752,6 +6784,7 @@
"label": "individual_7_PMID_12112661",
"meta_label": "PMID_36303223_individual_7_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -6888,6 +6921,7 @@
"label": "individual_8_PMID_12112661",
"meta_label": "PMID_36303223_individual_8_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
@@ -7024,6 +7058,7 @@
"label": "individual_9_PMID_12112661",
"meta_label": "PMID_36303223_individual_9_PMID_12112661"
},
+ "sex": "UNKNOWN_SEX",
"phenotypes": [
{
"term_id": "HP:0001250",
diff --git a/tests/test_io.py b/tests/test_io.py
index a479dceed..7d45cabdd 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -24,7 +24,6 @@ def test_regenerate_cohort(
):
"""
The test for regenerating the `SUOX.json` file based on a cohort of phenopackets.
- The test needs path to a folder with phenopacket JSON files (empty `str` below).
Note, the test may need to be run multiple times if the ENSEMBL API times out.
"""
diff --git a/tests/test_tutorial.py b/tests/test_tutorial.py
index 52924eff4..b592f1f85 100644
--- a/tests/test_tutorial.py
+++ b/tests/test_tutorial.py
@@ -6,7 +6,7 @@
from gpsea.preprocessing import configure_caching_cohort_creator, CohortCreator, load_phenopackets
from gpsea.analysis.pcats import HpoTermAnalysis
from gpsea.analysis.mtc_filter import HpoMtcFilter
-from gpsea.analysis.pcats.stats import ScipyFisherExact
+from gpsea.analysis.pcats.stats import FisherExactTest
from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate
from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest
@@ -53,7 +53,7 @@ def hpo_term_analysis(
mtc_filter,
) -> HpoTermAnalysis:
return HpoTermAnalysis(
- count_statistic=ScipyFisherExact(),
+ count_statistic=FisherExactTest(),
mtc_filter=mtc_filter,
mtc_correction='fdr_bh',
mtc_alpha=0.05,