diff --git a/.flake8 b/.flake8 index 4124b2b6..fed4715d 100644 --- a/.flake8 +++ b/.flake8 @@ -3,7 +3,8 @@ max-line-length = 120 # F405 - ignore error if class is imported in `*` import. # W293 - blank line contains whitespace -ignore = F405,W293 +# W503 - line break before binary operator +ignore = F405,W293,W503 exclude = .git, diff --git a/docs/img/phenotype_group_counts.png b/docs/img/phenotype_group_counts.png deleted file mode 100644 index e08a430a..00000000 Binary files a/docs/img/phenotype_group_counts.png and /dev/null differ diff --git a/docs/img/rere_phenotype_score_boxplot.png b/docs/img/rere_phenotype_score_boxplot.png new file mode 100644 index 00000000..684e57ae Binary files /dev/null and b/docs/img/rere_phenotype_score_boxplot.png differ diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index db0572a0..e4a80ba2 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,18 +1,261 @@ "Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, -,Count,Percent,Count,Percent,p value,Corrected p value -Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,5.6190936213143254e-05,0.0008990549794102921 -Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.00043478260869565214,0.003478260869565217 -Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.0036231884057971015,0.014492753623188406 -Heart block [HP:0012722],0/22,0%,2/2,100%,0.0036231884057971015,0.014492753623188406 -Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.005628510156750059,0.018011232501600187 -Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.01349527665317139,0.03598740440845704 -Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.02560162393963452,0.058517997576307476 -Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.07440639019586388,0.14881278039172777 -Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.1424522583078588,0.2532484592139712 -Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.1687456462342971,0.2699930339748754 -Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.42857142857142855,0.6233766233766234 -Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.5714285714285713,0.7619047619047618 -Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.7735491022101784,0.9520604334894502 -Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 -Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 +,Count,Percent,Count,Percent,Corrected p values,p values +Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 +Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 +Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015 +Heart block [HP:0012722],0/22,0%,2/2,100%,0.014492753623188406,0.0036231884057971015 +Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.018011232501600187,0.005628510156750059 +Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.03598740440845704,0.01349527665317139 +Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.058517997576307476,0.02560162393963452 +Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.14881278039172777,0.07440639019586388 +Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2532484592139712,0.1424522583078588 +Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2699930339748754,0.1687456462342971 +Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6233766233766234,0.42857142857142855 +Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.7619047619047618,0.5714285714285713 +Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,0.9520604334894502,0.7735491022101784 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 +Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 +Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 +Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, +Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, +Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, +Abnormal cardiovascular system morphology [HP:0030680],63/63,100%,30/30,100%,, +Abnormality of the cardiovascular system [HP:0001626],65/65,100%,32/32,100%,, +Phenotypic abnormality [HP:0000118],82/82,100%,38/38,100%,, +All [HP:0000001],82/82,100%,38/38,100%,, +Abnormal cardiac atrium morphology [HP:0005120],43/43,100%,20/20,100%,, +Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, +Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, +Perimembranous ventricular septal defect [HP:0011682],3/59,5%,3/25,12%,, +Short thumb [HP:0009778],11/41,27%,8/30,27%,, +Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,, +Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,, +Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,, +Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55,100%,22/22,100%,, +Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55,100%,22/22,100%,, +Aplasia/hypoplasia of the extremities [HP:0009815],55/55,100%,22/22,100%,, +Abnormality of limbs [HP:0040064],73/73,100%,34/34,100%,, +Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56,100%,23/23,100%,, +Abnormal skeletal morphology [HP:0011842],73/73,100%,35/35,100%,, +Abnormality of the skeletal system [HP:0000924],73/73,100%,35/35,100%,, +Abnormality of the musculoskeletal system [HP:0033127],74/74,100%,35/35,100%,, +Abnormal limb bone morphology [HP:0002813],63/63,100%,34/34,100%,, +Abnormality of limb bone [HP:0040068],63/63,100%,34/34,100%,, +Abnormal appendicular skeleton morphology [HP:0011844],64/64,100%,34/34,100%,, +Abnormality of the upper limb [HP:0002817],73/73,100%,34/34,100%,, +Abnormal hand morphology [HP:0005922],53/53,100%,20/20,100%,, +Abnormality of the hand [HP:0001155],60/60,100%,31/31,100%,, +Abnormal finger morphology [HP:0001167],36/36,100%,31/31,100%,, +Abnormal digit morphology [HP:0011297],38/38,100%,33/33,100%,, +Abnormal thumb morphology [HP:0001172],30/30,100%,31/31,100%,, +Short finger [HP:0009381],11/11,100%,8/8,100%,, +Short digit [HP:0011927],11/11,100%,10/10,100%,, +Abnormality of thumb phalanx [HP:0009602],13/13,100%,13/13,100%,, +Forearm undergrowth [HP:0009821],30/30,100%,7/7,100%,, +Upper limb undergrowth [HP:0009824],33/33,100%,7/7,100%,, +Limb undergrowth [HP:0009826],33/33,100%,7/7,100%,, +Aplasia/hypoplasia involving forearm bones [HP:0006503],37/37,100%,12/12,100%,, +Abnormal forearm bone morphology [HP:0040072],37/37,100%,14/14,100%,, +Abnormal upper limb bone morphology [HP:0040070],40/40,100%,14/14,100%,, +Abnormal forearm morphology [HP:0002973],37/37,100%,14/14,100%,, +Short long bone [HP:0003026],35/35,100%,9/9,100%,, +Abnormal long bone morphology [HP:0011314],44/44,100%,13/13,100%,, +Aplasia/Hypoplasia of the radius [HP:0006501],37/37,100%,11/11,100%,, +Abnormal morphology of the radius [HP:0002818],37/37,100%,13/13,100%,, +Aplasia of the fingers [HP:0009380],15/15,100%,14/14,100%,, +Tricuspid regurgitation [HP:0005180],3/3,100%,0/0,0%,, +Atrioventricular valve regurgitation [HP:0034376],4/4,100%,2/2,100%,, +Abnormal atrioventricular valve physiology [HP:0031650],4/4,100%,2/2,100%,, +Abnormal heart valve physiology [HP:0031653],4/4,100%,2/2,100%,, +Abnormal cardiovascular system physiology [HP:0011025],23/23,100%,5/5,100%,, +Abnormal tricuspid valve physiology [HP:0031651],3/3,100%,0/0,0%,, +Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,2/2,100%,, +Cleft soft palate [HP:0000185],2/2,100%,0/0,0%,, +Abnormal soft palate morphology [HP:0100736],2/2,100%,0/0,0%,, +Abnormal palate morphology [HP:0000174],5/5,100%,0/0,0%,, +Abnormal oral cavity morphology [HP:0000163],5/5,100%,1/1,100%,, +Abnormal oral morphology [HP:0031816],5/5,100%,1/1,100%,, +Abnormality of the mouth [HP:0000153],5/5,100%,1/1,100%,, +Abnormality of the face [HP:0000271],5/5,100%,1/1,100%,, +Abnormality of the head [HP:0000234],5/5,100%,2/2,100%,, +Abnormality of head or neck [HP:0000152],5/5,100%,2/2,100%,, +Cleft palate [HP:0000175],2/2,100%,0/0,0%,, +Orofacial cleft [HP:0000202],2/2,100%,0/0,0%,, +Abnormal carpal morphology [HP:0001191],30/32,94%,0/0,0%,, +Abnormality of the wrist [HP:0003019],30/30,100%,0/0,0%,, +Abnormality of upper limb joint [HP:0009810],30/30,100%,6/6,100%,, +Abnormal joint morphology [HP:0001367],31/31,100%,6/6,100%,, +Mitral valve prolapse [HP:0001634],0/0,0%,1/1,100%,, +Abnormal mitral valve morphology [HP:0001633],0/0,0%,1/1,100%,, +Abnormal atrioventricular valve morphology [HP:0006705],0/0,0%,1/1,100%,, +Abnormal heart valve morphology [HP:0001654],0/0,0%,1/1,100%,, +Mitral regurgitation [HP:0001653],1/1,100%,2/2,100%,, +Abnormal mitral valve physiology [HP:0031481],1/1,100%,2/2,100%,, +Limited pronation/supination of forearm [HP:0006394],0/0,0%,3/3,100%,, +Limited elbow movement [HP:0002996],0/0,0%,4/4,100%,, +Abnormality of the elbow [HP:0009811],0/0,0%,5/5,100%,, +Limitation of joint mobility [HP:0001376],0/0,0%,4/4,100%,, +Abnormality of joint mobility [HP:0011729],1/1,100%,5/5,100%,, +Abnormal joint physiology [HP:0034430],1/1,100%,5/5,100%,, +Abnormal musculoskeletal physiology [HP:0011843],1/1,100%,5/5,100%,, +Abnormality of cardiovascular system electrophysiology [HP:0030956],15/15,100%,3/3,100%,, +Upper limb phocomelia [HP:0009813],8/85,9%,2/37,5%,, +Phocomelia [HP:0009829],8/8,100%,2/2,100%,, +Pre-capillary pulmonary hypertension [HP:0033578],4/4,100%,0/0,0%,, +Elevated pulmonary artery pressure [HP:0004890],4/4,100%,0/0,0%,, +Abnormality of pulmonary circulation [HP:0030875],4/4,100%,0/0,0%,, +Abnormal vascular physiology [HP:0030163],4/4,100%,0/0,0%,, +Abnormality of the vasculature [HP:0002597],10/10,100%,2/2,100%,, +Abnormal respiratory system physiology [HP:0002795],4/4,100%,0/0,0%,, +Abnormality of the respiratory system [HP:0002086],4/4,100%,0/0,0%,, +Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,0/0,0%,, +Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,0/0,0%,, +Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,0/0,0%,, +Abnormal finger phalanx morphology [HP:0005918],3/3,100%,0/0,0%,, +Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,0/0,0%,, +Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,0/0,0%,, +Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,0/0,0%,, +Aplasia involving bones of the extremities [HP:0009825],0/0,0%,0/0,0%,, +Aplasia of metacarpal bones [HP:0010048],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,0/0,0%,, +Abnormal metacarpal morphology [HP:0005916],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,0/0,0%,, +Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,0/0,0%,, +Partial absence of thumb [HP:0009659],0/0,0%,0/0,0%,, +1-2 finger syndactyly [HP:0010704],3/3,100%,1/1,100%,, +Finger syndactyly [HP:0006101],4/4,100%,2/2,100%,, +Syndactyly [HP:0001159],4/4,100%,2/2,100%,, +Complete atrioventricular canal defect [HP:0001674],5/37,14%,3/36,8%,, +Atrioventricular canal defect [HP:0006695],5/5,100%,3/3,100%,, +Persistent left superior vena cava [HP:0005301],2/37,5%,0/0,0%,, +Abnormal superior vena cava morphology [HP:0025575],2/2,100%,0/0,0%,, +Abnormal vena cava morphology [HP:0005345],2/2,100%,0/0,0%,, +Abnormal morphology of the great vessels [HP:0030962],6/6,100%,2/2,100%,, +Abnormal blood vessel morphology [HP:0033353],6/6,100%,2/2,100%,, +Abnormal vascular morphology [HP:0025015],6/6,100%,2/2,100%,, +Abnormal venous morphology [HP:0002624],2/2,100%,0/0,0%,, +Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,, +Abnormal thorax morphology [HP:0000765],6/6,100%,5/5,100%,, +Abnormal axial skeleton morphology [HP:0009121],8/8,100%,5/5,100%,, +Aplasia/hypoplasia of the humerus [HP:0006507],7/7,100%,4/4,100%,, +Abnormality of the humerus [HP:0003063],7/7,100%,4/4,100%,, +Abnormality of the upper arm [HP:0001454],7/7,100%,4/4,100%,, +Right atrial enlargement [HP:0030718],4/4,100%,0/0,0%,, +Abnormal right atrium morphology [HP:0025580],4/4,100%,0/0,0%,, +Atrial septal dilatation [HP:0011995],4/4,100%,0/0,0%,, +Pectus excavatum [HP:0000767],3/4,75%,2/2,100%,, +Abnormal sternum morphology [HP:0000766],3/3,100%,2/2,100%,, +Postaxial hand polydactyly [HP:0001162],3/4,75%,0/0,0%,, +Postaxial polydactyly [HP:0100259],3/3,100%,0/0,0%,, +Polydactyly [HP:0010442],3/3,100%,0/0,0%,, +Abnormal 5th finger morphology [HP:0004207],4/4,100%,0/0,0%,, +Hand polydactyly [HP:0001161],3/3,100%,0/0,0%,, +Duplication of phalanx of hand [HP:0009997],3/3,100%,0/0,0%,, +Duplication of hand bones [HP:0004275],3/3,100%,0/0,0%,, +Duplication of bones involving the upper extremities [HP:0009142],3/3,100%,0/0,0%,, +High palate [HP:0000218],3/3,100%,0/0,0%,, +Short neck [HP:0000470],3/3,100%,0/0,0%,, +Abnormal neck morphology [HP:0025668],3/3,100%,0/0,0%,, +Abnormality of the neck [HP:0000464],3/3,100%,0/0,0%,, +Abnormality of the cervical spine [HP:0003319],3/3,100%,0/0,0%,, +Abnormality of the vertebral column [HP:0000925],4/4,100%,1/1,100%,, +Shield chest [HP:0000914],3/3,100%,0/0,0%,, +Enlarged thorax [HP:0100625],3/3,100%,0/0,0%,, +Abnormal rib cage morphology [HP:0001547],4/4,100%,0/0,0%,, +Y-shaped metatarsals [HP:0010567],3/3,100%,0/0,0%,, +Abnormal metatarsal morphology [HP:0001832],3/3,100%,0/0,0%,, +Abnormal lower limb bone morphology [HP:0040069],3/3,100%,0/0,0%,, +Abnormality of the lower limb [HP:0002814],3/3,100%,0/0,0%,, +Abnormal foot morphology [HP:0001760],3/3,100%,0/0,0%,, +Upper extremity joint dislocation [HP:0030310],0/0,0%,2/2,100%,, +Joint dislocation [HP:0001373],0/0,0%,2/2,100%,, +Clinodactyly of the 5th finger [HP:0004209],1/1,100%,0/0,0%,, +Finger clinodactyly [HP:0040019],1/1,100%,0/0,0%,, +Clinodactyly [HP:0030084],1/1,100%,0/0,0%,, +Deviation of finger [HP:0004097],1/1,100%,2/2,100%,, +Deviation of the 5th finger [HP:0009179],1/1,100%,0/0,0%,, +Proximal placement of thumb [HP:0009623],0/0,0%,0/0,0%,, +Deviation of the thumb [HP:0009603],0/0,0%,2/2,100%,, +Abnormal 2nd finger morphology [HP:0004100],1/1,100%,0/0,0%,, +Aplasia/Hypoplasia of the 2nd finger [HP:0006264],1/1,100%,0/0,0%,, +Short middle phalanx of finger [HP:0005819],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,0/0,0%,, +Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,0/0,0%,, +Short phalanx of finger [HP:0009803],0/0,0%,0/0,0%,, +Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,0/0,0%,, +Type A brachydactyly [HP:0009370],0/0,0%,0/0,0%,, +Brachydactyly [HP:0001156],0/0,0%,0/0,0%,, +Short 5th finger [HP:0009237],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,0/0,0%,, +Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,0/0,0%,, +Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,0/0,0%,, +Congenital malformation of the great arteries [HP:0011603],4/4,100%,2/2,100%,, +Aplasia involving forearm bones [HP:0009822],7/7,100%,6/6,100%,, +Absent forearm bone [HP:0003953],7/7,100%,6/6,100%,, +Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,2/2,100%,, +Abnormal morphology of ulna [HP:0040071],2/2,100%,4/4,100%,, +Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,0/0,0%,, +Shoulder muscle hypoplasia [HP:0008952],0/0,0%,0/0,0%,, +Hypoplasia of the musculature [HP:0009004],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,0/0,0%,, +Abnormality of muscle size [HP:0030236],0/0,0%,0/0,0%,, +Abnormal skeletal muscle morphology [HP:0011805],2/2,100%,0/0,0%,, +Abnormality of the musculature [HP:0003011],2/2,100%,0/0,0%,, +Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,0/0,0%,, +Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,0/0,0%,, +Abnormality of the musculature of the limbs [HP:0009127],2/2,100%,0/0,0%,, +Abnormality of the musculature of the upper limbs [HP:0001446],2/2,100%,0/0,0%,, +Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,0/0,0%,, +Atrioventricular dissociation [HP:0011709],0/22,0%,1/1,100%,, +Abnormal toe morphology [HP:0001780],0/0,0%,0/0,0%,, +Abnormal toe phalanx morphology [HP:0010161],0/0,0%,0/0,0%,, +Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,0/0,0%,, +Patent foramen ovale [HP:0001655],4/40,10%,0/36,0%,, +Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,, +Short 1st metacarpal [HP:0010034],0/30,0%,0/22,0%,, +Short phalanx of the thumb [HP:0009660],0/30,0%,0/22,0%,, +11 pairs of ribs [HP:0000878],1/1,100%,0/0,0%,, +Missing ribs [HP:0000921],1/1,100%,0/0,0%,, +Aplasia/Hypoplasia of the ribs [HP:0006712],1/1,100%,0/0,0%,, +Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],1/1,100%,2/2,100%,, +Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,2/2,100%,, +Abnormal rib morphology [HP:0000772],1/1,100%,0/0,0%,, +First degree atrioventricular block [HP:0011705],0/22,0%,1/1,100%,, +Micrognathia [HP:0000347],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,, +Abnormal skull morphology [HP:0000929],1/1,100%,2/2,100%,, +Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,, +Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,, +Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,, +Bowed forearm bones [HP:0003956],0/0,0%,1/1,100%,, +Bowing of the arm [HP:0006488],0/0,0%,1/1,100%,, +Bowing of the long bones [HP:0006487],0/0,0%,1/1,100%,, +Abnormal diaphysis morphology [HP:0000940],0/0,0%,1/1,100%,, +Common atrium [HP:0011565],0/83,0%,0/38,0%,, +Unroofed coronary sinus [HP:0031297],0/85,0%,0/38,0%,, +Amelia involving the upper limbs [HP:0009812],0/83,0%,1/37,3%,, +Third degree atrioventricular block [HP:0001709],0/22,0%,1/1,100%,, +Small hypothenar eminence [HP:0010487],2/2,100%,0/0,0%,, +Abnormality of the hypothenar eminence [HP:0010486],2/2,100%,0/0,0%,, +Abnormality of the musculature of the hand [HP:0001421],2/2,100%,0/0,0%,, +Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,, +Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,, +Left ventricular noncompaction cardiomyopathy [HP:0011664],0/1,0%,1/5,20%,, +Noncompaction cardiomyopathy [HP:0012817],0/0,0%,1/1,100%,, +Cardiomyopathy [HP:0001638],0/0,0%,1/1,100%,, +Abnormal myocardium morphology [HP:0001637],0/0,0%,1/1,100%,, +Sinus bradycardia [HP:0001688],0/0,0%,1/1,100%,, +Abnormal electrophysiology of sinoatrial node origin [HP:0011702],0/0,0%,1/1,100%,, +Arrhythmia [HP:0011675],1/1,100%,1/1,100%,, +Bradycardia [HP:0001662],0/0,0%,1/1,100%,, +Abnormal skin morphology [HP:0011121],0/0,0%,1/1,100%,, +Abnormality of the skin [HP:0000951],0/0,0%,1/1,100%,, +Abnormality of the integument [HP:0001574],0/0,0%,1/1,100%,, +Sinus venosus atrial septal defect [HP:0011567],0/2,0%,1/1,100%,, diff --git a/docs/report/tbx5_mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html similarity index 91% rename from docs/report/tbx5_mtc_report.html rename to docs/report/tbx5_frameshift_vs_missense.mtc_report.html index 175a0245..78345301 100644 --- a/docs/report/tbx5_mtc_report.html +++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 15 out of the total of 259 HPO terms.

+

Performed statistical tests for 16 out of the total of 259 HPO terms.

- + @@ -61,15 +61,15 @@

Phenotype testing report

- - + + - - + + @@ -135,13 +135,6 @@

Phenotype testing report

- - - - - - - diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 9fad88dd..a2bc3230 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -9,16 +9,16 @@ The tutorial illustrates just one of the many ways GPSEA can be used to characte The :ref:`user-guide` contains details about additional methods and functionalities. -The tutorial presents an analysis of a cohort of individuals with pathogenic variants in *TBX5* leading to -`Holt-Oram syndrome MIM:142900 `_. +The tutorial presents an analysis of a cohort of individuals with pathogenic variants in *TBX5* leading to +`Holt-Oram syndrome MIM:142900 `_. -Holt-Oram syndrome is an autosomal dominant disorder characterized by +Holt-Oram syndrome is an autosomal dominant disorder characterized by upper limb defects, congenital heart defects, and arrhythmias (`PMID:38336121 `_). It has been observed in the literature that congenital defects of the ventricular and atrial septum are more common in the truncating than in the missense variants (`PMID:30552424 `_). Additionally, upper limb defects are more frequent in patients with protein-truncating variants (`PMID:38336121 `_). -To perform the analysis, we curated the literature and created on `GA4GH phenopacket `_ for each +To perform the analysis, we curated the literature and created on `GA4GH phenopacket `_ for each affected individual. The phenopackets are made available in `Phenopacket Store `_. @@ -26,9 +26,9 @@ affected individual. The phenopackets are made available in `Phenopacket Store < The analysis ~~~~~~~~~~~~ -For the analysis, the `MANE `_ transcript +For the analysis, the `MANE `_ transcript (i.e., the "main" biomedically relevant transcript of a gene) should be chosen unless -there is a specific reason not to (which should occur rarely if at all). +there is a specific reason not to (which should occur rarely if at all). In the case of *TBX5* the MANE transcript is `NM_181486.4`. Note that the trascript identifier (`NM_181486`) and the version (`4`) are both required. A good way to find the MANE transcript is to search on the gene symbol (e.g., *TBX5*) in `ClinVar `_ and to @@ -49,7 +49,7 @@ corresponding protein accession `NP_852259.1`. Load HPO ^^^^^^^^ -GPSEA needs HPO to do the analysis. +GPSEA needs HPO to do the analysis. We use HPO toolkit to load HPO version `v2023-10-09`: >>> import hpotk @@ -75,13 +75,13 @@ and stored in `Phenopacket Store >> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets >>> cohort_creator = configure_caching_cohort_creator(hpo) >>> cohort, validation = load_phenopackets( # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -... phenopackets=phenopackets, +... phenopackets=phenopackets, ... cohort_creator=cohort_creator, ... ) Patients Created: ... @@ -96,7 +96,7 @@ We loaded the patient data into a `cohort` which is ready for the next steps. .. seealso:: - Here we show how to create a :class:`~gpsea.model.Cohort` from phenopackets. + Here we show how to create a :class:`~gpsea.model.Cohort` from phenopackets. See :ref:`input-data` section to learn how to create a cohort from another inputs. @@ -113,7 +113,7 @@ We can now explore the cohort to see how many patients are included. .. raw:: html :file: report/tbx5_cohort_info.html - + .. note:: The report can also be displayed directly in a Jupyter notebook by running:: @@ -123,14 +123,14 @@ We can now explore the cohort to see how many patients are included. Now we can show the distribution of variants with respect to the encoded protein. We first obtain `tx_coordinates` (:class:`~gpsea.model.TranscriptCoordinates`) -and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`) +and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`) with information about the transcript and protein "anatomy": >>> from gpsea.model.genome import GRCh38 >>> from gpsea.preprocessing import configure_protein_metadata_service, VVMultiCoordinateService >>> txc_service = VVMultiCoordinateService(genome_build=GRCh38) >>> pms = configure_protein_metadata_service() ->>> tx_coordinates = txc_service.fetch(tx_id) +>>> tx_coordinates = txc_service.fetch(tx_id) >>> protein_meta = pms.annotate(px_id) and we follow with plotting the diagram of the mutations on the protein: @@ -158,7 +158,7 @@ Prepare genotype and phenotype predicates ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We will create a predicate to bin patients into group -depending on presence of a missense and frameshift variant to test +depending on presence of a missense and frameshift variant to test if there is a difference between frameshift and non-frameshift variants in the individuals of the *TBX5* cohort. @@ -176,74 +176,99 @@ in the individuals of the *TBX5* cohort. .. note:: - There are many other ways to set up a predicate for testing + There are many other ways to set up a predicate for testing for a GP correlation. See the :ref:`predicates` section to learn more about building a predicate of interest. +The phenotype grouping is based on presence or absence of an HPO term. +We take advantage of the ontology "true path rule" to infer presence +of the ancestor terms for all present HPO terms +(e.g. presence of `Abnormal ventricular septum morphology `_ +in an individual annotated with `Ventricular septal defect `_) +and exclusion of the descendant terms for all excluded terms (e.g. exclusion of +`Motor seizure `_ +in an individual where `Seizure `_ +was excluded): + +>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest +>>> pheno_predicates = prepare_predicates_for_terms_of_interest( +... cohort=cohort, +... hpo=hpo, +... min_n_of_patients_with_term=2, +... ) -By default, GPSEA will perform one hypothesis test for each HPO term used to annotate more than one individual in the cohort. -This also includes the terms implied by the ontology "true path rule", -which states that presence of a term -(e.g., `Ventricular septal defect `_) -implies presence of all its ancestor terms -(e.g., `Abnormal ventricular septum morphology `_, -`Abnormal cardiac septum morphology `_, -`Abnormal cardiac ventricle morphology `_, ...). -However, testing multiple hypothesis increases the chance of receiving false positive result, -and multiple testing correction must be applied. -See :ref:`mtc` for information about how to perform multiple testing correction with GPSEA. +By default, GPSEA will perform one hypothesis test for each HPO term used to annotate two or more individuals in the cohort +(see ``min_n_of_patients_with_term=2`` above). +Testing multiple hypothesis on the same dataset increases the chance of receiving false positive result. +However, GPSEA simplifies the application of an appropriate multiple testing correction. For general use, we recommend using a combination of a *Phenotype MTC filter* (:class:`~gpsea.analysis.PhenotypeMtcFilter`) with a *multiple testing correction*. -Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which +Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which reduce the multiple testing burden and focus the analysis on the most interesting terms (see :ref:`HPO MTC filter ` for more info). Then the multiple testing correction, such as Bonferroni or Benjamini-Hochberg, is used to control the family-wise error rate or the false discovery rate. +See :ref:`mtc` for more information. -Here we use HPO MTC filter (:meth:`~gpsea.analysis.CohortAnalysisConfiguration.hpo_mtc_strategy`) -along with Benjamini-Hochberg procedure (:meth:`~gpsea.analysis.CohortAnalysisConfiguration.pval_correction`): +In this example, we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) +with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``) +with a false discovery control level at (``mtc_alpha=0.05``): ->>> from gpsea.analysis import configure_cohort_analysis, CohortAnalysisConfiguration ->>> config = CohortAnalysisConfiguration() ->>> config.hpo_mtc_strategy() ->>> config.pval_correction = 'fdr_bh' ->>> analysis = configure_cohort_analysis( -... cohort=cohort, -... hpo=hpo, -... config=config, +>>> from gpsea.analysis.mtc_filter import HpoMtcFilter +>>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=0.2) +>>> mtc_correction = 'fdr_bh' +>>> mtc_alpha = 0.05 + +Choosing the statistical procedure for assessment of association between genotype and phenotype +groups is the last missing piece of the analysis. We will use Fisher Exact Test: + +>>> from gpsea.analysis.pcats.stats import ScipyFisherExact +>>> count_statistic = ScipyFisherExact() + +and we finalize the analysis setup by putting all components together +into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: + +>>> from gpsea.analysis.pcats import HpoTermAnalysis +>>> analysis = HpoTermAnalysis( +... count_statistic=count_statistic, +... mtc_filter=mtc_filter, +... mtc_correction=mtc_correction, +... mtc_alpha=mtc_alpha, ... ) Now we can perform the analysis and investigate the results. ->>> result = analysis.compare_genotype_vs_cohort_phenotypes(gt_predicate) +>>> result = analysis.compare_genotype_vs_phenotypes( +... cohort=cohort, +... gt_predicate=gt_predicate, +... pheno_predicates=pheno_predicates, +... ) >>> result.total_tests 16 We only tested 16 HPO terms. This is despite the individuals being collectively annotated with 259 direct and indirect HPO terms ->>> from gpsea.analysis import prepare_hpo_terms_of_interest ->>> terms = prepare_hpo_terms_of_interest(hpo, cohort.all_patients, min_n_of_patients_with_term=2) ->>> len(terms) +>>> len(result.phenotypes) 259 We can show the reasoning behind *not* testing 243 (`259 - 16`) HPO terms by exploring the phenotype MTC filtering report. >>> from gpsea.view import MtcStatsViewer ->>> mtc_viewer = MtcStatsViewer() ->>> mtc_report = mtc_viewer.process(result.mtc_filter_report) ->>> with open('docs/report/tbx5_mtc_report.html', 'w') as fh: # doctest: +SKIP +>>> mtc_viewer = MtcStatsViewer() +>>> mtc_report = mtc_viewer.process(result) +>>> with open('docs/report/tbx5_frameshift_vs_missense.mtc_report.html', 'w') as fh: # doctest: +SKIP ... _ = fh.write(mtc_report) .. raw:: html - :file: report/tbx5_mtc_report.html + :file: report/tbx5_frameshift_vs_missense.mtc_report.html .. - - TODO: + + TODO: Show how to write out the tested HPO terms. and these are the HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: @@ -259,8 +284,8 @@ and these are the HPO terms ordered by the p value corrected with the Benjamini- We see that several HPO terms are significantly associated with presence of a frameshift variant in *TBX5*. For example, `Ventricular septal defect `_ -was observed in 31/60 (52%) patients with a missense variant +was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. -Fisher exact test computed a p value of `~0.0000562` -and the p value corrected by Benjamini-Hochberg procedure +Fisher exact test computed a p value of `~0.0000562` +and the p value corrected by Benjamini-Hochberg procedure is `~0.00112`. diff --git a/docs/user-guide/glossary.rst b/docs/user-guide/glossary.rst new file mode 100644 index 00000000..bf8f9d90 --- /dev/null +++ b/docs/user-guide/glossary.rst @@ -0,0 +1,89 @@ +.. _glossary: + +======== +Glossary +======== + +The glossary summarizes several frequently used concepts. + +.. _true-path-rule: + +True path rule +~~~~~~~~~~~~~~ + +The true path rule of ontologies states that an item (e.g. an individual) annotated with an ontology term +(e.g. `Focal-onset seizure `_) +is implicitly annotated with all its *ancestor* terms +(`Seizure `_, +`Abnormal nervous system physiology `_, ...). +Conversely, exclusion of a term (e.g. `Abnormal ventricular septum morphology `_) +implies exclusion of all its *descendants* +(`Ventricular septal defect `_, +`Ventricular septal aneurysm `_, ...). + + +.. _length-of-the-reference-allele: + +Length of the reference allele +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The length of the REF allele corresponds to the length of the genomic region affected by the variant. + +Let's show a few examples. + +>>> from gpsea.model import VariantCoordinates +>>> from gpsea.model.genome import GRCh38 +>>> chr1 = GRCh38.contig_by_name("chr1") + +The length of the reference allele of a missense variant is 1 +because the variant affects a 1-bp region spanned by the ``C`` nucleotide: + +>>> missense = VariantCoordinates.from_vcf_literal(chr1, 1001, 'C', 'T') +>>> len(missense) +1 + +The length of a "small" deletion is the same as the length of the ref allele `str`: +(``'CCC'`` in the example below): + +>>> deletion = VariantCoordinates.from_vcf_literal(chr1, 1001, 'CCC', 'C') +>>> len(deletion) +3 + +This is because the literal notation spells out the alleles. +However, this simple rule does not apply in symbolic notation. +Here, the REF length corresponds to the length of the allele region. + +For instance, for the following structural variant:: + + #CHROM POS ID REF ALT QUAL FILTER INFO + 1 1001 . C 6 PASS SVTYPE=DEL;END=1003;SVLEN=-3 + +the length of the REF allele is `3`: + +>>> sv_deletion = VariantCoordinates.from_vcf_symbolic( +... chr1, pos=1001, end=1003, +... ref='C', alt='', svlen=-3, +... ) +>>> len(sv_deletion) +3 + +because the deletion removes 3 base pairs at the coordinates :math:`[1001, 1003]`. + + +.. _change-length-of-an-allele: + +Change length of an allele +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Change length is defined as the difference between lengths of the *alt* and *ref* alleles. +SNVs lead to change length of zero, deletions and insertions/duplications lead to negative +and positive change lengths, respectively. See the table below for examples. + +================== ================== =============== + Reference allele Alternate allele Change length +================== ================== =============== + C T 0 + CG AT 0 + ACTG A -3 + A AAT 2 +================== ================== =============== diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 923b1834..9f7933ee 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -13,6 +13,7 @@ TODO - write high level overview and bridge to individual sections. input-data exploratory predicates - mtc stats + mtc + glossary autosomal_recessive diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index 30ee95df..7aa3e23e 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -104,6 +104,8 @@ This is how we can set an alternative MTC correction procedure: 'fdr_bh' +.. _mtc-filters: + MTC filters: Choosing which terms to test ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv new file mode 100644 index 00000000..d794cec4 --- /dev/null +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -0,0 +1,262 @@ +FRAMESHIFT_VARIANT on NM_181486.4,Yes,Yes,No,No,, +,Count,Percent,Count,Percent,Corrected p values,p values +Ventricular septal defect [HP:0001629],19/19,100%,42/71,59%,0.004112753923262261,0.00024192670136836828 +Abnormal atrioventricular conduction [HP:0005150],3/3,100%,1/23,4%,0.013076923076923076,0.0015384615384615385 +Absent thumb [HP:0009777],14/31,45%,18/100,18%,0.021044138590779585,0.0037136715160199273 +Atrioventricular block [HP:0001678],2/2,100%,1/23,4%,0.034,0.010000000000000002 +Heart block [HP:0012722],2/2,100%,1/23,4%,0.034,0.010000000000000002 +Patent ductus arteriosus [HP:0001643],2/2,100%,6/40,15%,0.09214092140921408,0.03252032520325203 +Secundum atrial septal defect [HP:0001684],4/22,18%,23/55,42%,0.1440020479198931,0.06544319142266644 +Triphalangeal thumb [HP:0001199],13/32,41%,23/99,23%,0.1440020479198931,0.06932119159387057 +Cardiac conduction abnormality [HP:0031546],3/3,100%,15/37,41%,0.1440020479198931,0.08259109311740892 +Muscular ventricular septal defect [HP:0011623],6/25,24%,8/84,10%,0.1440020479198931,0.08470708701170182 +Pulmonary arterial hypertension [HP:0002092],0/2,0%,8/14,57%,0.6899307928951144,0.4666666666666667 +Short thumb [HP:0009778],8/30,27%,25/69,36%,0.6899307928951144,0.4870099714553749 +Absent radius [HP:0003974],6/25,24%,9/43,21%,1.0,0.7703831604944444 +Atrial septal defect [HP:0001631],20/20,100%,63/65,97%,1.0,1.0 +Hypoplasia of the radius [HP:0002984],6/14,43%,34/75,45%,1.0,1.0 +Short humerus [HP:0005792],4/9,44%,8/21,38%,1.0,1.0 +Hypoplasia of the ulna [HP:0003022],2/10,20%,3/17,18%,1.0,1.0 +Aplasia/Hypoplasia of the thumb [HP:0009601],19/19,100%,40/40,100%,, +Aplasia/Hypoplasia of fingers [HP:0006265],19/19,100%,44/44,100%,, +Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19,100%,44/44,100%,, +Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22,100%,78/78,100%,, +Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22,100%,78/78,100%,, +Aplasia/hypoplasia of the extremities [HP:0009815],22/22,100%,78/78,100%,, +Abnormality of limbs [HP:0040064],34/34,100%,102/102,100%,, +Phenotypic abnormality [HP:0000118],38/38,100%,114/114,100%,, +All [HP:0000001],38/38,100%,114/114,100%,, +Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23,100%,80/80,100%,, +Abnormal skeletal morphology [HP:0011842],35/35,100%,103/103,100%,, +Abnormality of the skeletal system [HP:0000924],35/35,100%,103/103,100%,, +Abnormality of the musculoskeletal system [HP:0033127],35/35,100%,104/104,100%,, +Abnormal limb bone morphology [HP:0002813],34/34,100%,92/92,100%,, +Abnormality of limb bone [HP:0040068],34/34,100%,92/92,100%,, +Abnormal appendicular skeleton morphology [HP:0011844],34/34,100%,93/93,100%,, +Abnormality of the upper limb [HP:0002817],34/34,100%,102/102,100%,, +Abnormal hand morphology [HP:0005922],20/20,100%,75/75,100%,, +Abnormality of the hand [HP:0001155],31/31,100%,88/88,100%,, +Abnormal finger morphology [HP:0001167],31/31,100%,64/64,100%,, +Abnormal digit morphology [HP:0011297],33/33,100%,67/67,100%,, +Abnormal thumb morphology [HP:0001172],31/31,100%,58/58,100%,, +Short finger [HP:0009381],8/8,100%,27/27,100%,, +Short digit [HP:0011927],10/10,100%,28/28,100%,, +Abnormality of thumb phalanx [HP:0009602],13/13,100%,26/26,100%,, +Congenital malformation of the great arteries [HP:0011603],2/2,100%,7/7,100%,, +Abnormal morphology of the great vessels [HP:0030962],2/2,100%,10/10,100%,, +Abnormal blood vessel morphology [HP:0033353],2/2,100%,11/11,100%,, +Abnormal vascular morphology [HP:0025015],2/2,100%,11/11,100%,, +Abnormal cardiovascular system morphology [HP:0030680],30/30,100%,92/92,100%,, +Abnormality of the cardiovascular system [HP:0001626],32/32,100%,94/94,100%,, +Abnormality of the vasculature [HP:0002597],2/2,100%,17/17,100%,, +Aplasia of the 1st metacarpal [HP:0010035],0/0,0%,3/3,100%,, +Aplasia of the proximal phalanges of the hand [HP:0010242],0/0,0%,3/3,100%,, +Aplasia/Hypoplasia of the proximal phalanges of the hand [HP:0009851],0/0,0%,3/3,100%,, +Abnormal proximal phalanx morphology of the hand [HP:0009834],0/0,0%,3/3,100%,, +Abnormal finger phalanx morphology [HP:0005918],0/0,0%,9/9,100%,, +Aplasia/Hypoplasia of the phalanges of the hand [HP:0009767],0/0,0%,6/6,100%,, +Aplasia of the phalanges of the hand [HP:0009802],0/0,0%,3/3,100%,, +Aplasia involving bones of the upper limbs [HP:0009823],0/0,0%,3/3,100%,, +Aplasia involving bones of the extremities [HP:0009825],0/0,0%,3/3,100%,, +Finger aplasia [HP:0009380],14/14,100%,23/23,100%,, +Aplasia of metacarpal bones [HP:0010048],0/0,0%,3/3,100%,, +Aplasia/Hypoplasia involving the metacarpal bones [HP:0005914],0/0,0%,4/4,100%,, +Abnormal metacarpal morphology [HP:0005916],0/0,0%,4/4,100%,, +Abnormal upper limb bone morphology [HP:0040070],14/14,100%,50/50,100%,, +Aplasia/Hypoplasia of the 1st metacarpal [HP:0010026],0/0,0%,4/4,100%,, +Abnormal 1st metacarpal morphology [HP:0010009],0/0,0%,4/4,100%,, +Aplasia/Hypoplasia of the phalanges of the thumb [HP:0009658],0/0,0%,4/4,100%,, +Partial absence of thumb [HP:0009659],0/0,0%,3/3,100%,, +Abnormal atrial septum morphology [HP:0011994],20/20,100%,64/64,100%,, +Abnormal cardiac septum morphology [HP:0001671],28/28,100%,89/89,100%,, +Abnormal heart morphology [HP:0001627],30/30,100%,89/89,100%,, +Abnormal cardiac atrium morphology [HP:0005120],20/20,100%,64/64,100%,, +Perimembranous ventricular septal defect [HP:0011682],3/25,12%,6/84,7%,, +Abnormal ventricular septum morphology [HP:0010438],19/19,100%,42/42,100%,, +Abnormal cardiac ventricle morphology [HP:0001713],19/19,100%,43/43,100%,, +Forearm undergrowth [HP:0009821],7/7,100%,35/35,100%,, +Upper limb undergrowth [HP:0009824],7/7,100%,38/38,100%,, +Limb undergrowth [HP:0009826],7/7,100%,38/38,100%,, +Aplasia/hypoplasia involving forearm bones [HP:0006503],12/12,100%,43/43,100%,, +Abnormal forearm bone morphology [HP:0040072],14/14,100%,43/43,100%,, +Abnormal forearm morphology [HP:0002973],14/14,100%,43/43,100%,, +Short long bone [HP:0003026],9/9,100%,41/41,100%,, +Abnormal long bone morphology [HP:0011314],13/13,100%,50/50,100%,, +Aplasia/Hypoplasia of the radius [HP:0006501],11/11,100%,43/43,100%,, +Abnormal morphology of the radius [HP:0002818],13/13,100%,43/43,100%,, +Abnormal carpal morphology [HP:0001191],0/0,0%,30/32,94%,, +Abnormality of the wrist [HP:0003019],0/0,0%,30/30,100%,, +Abnormality of upper limb joint [HP:0009810],6/6,100%,32/32,100%,, +Abnormal joint morphology [HP:0001367],6/6,100%,33/33,100%,, +Persistent left superior vena cava [HP:0005301],0/0,0%,4/39,10%,, +Abnormal superior vena cava morphology [HP:0025575],0/0,0%,4/4,100%,, +Abnormal vena cava morphology [HP:0005345],0/0,0%,4/4,100%,, +Abnormal venous morphology [HP:0002624],0/0,0%,4/4,100%,, +Tricuspid regurgitation [HP:0005180],0/0,0%,5/5,100%,, +Atrioventricular valve regurgitation [HP:0034376],2/2,100%,7/7,100%,, +Abnormal atrioventricular valve physiology [HP:0031650],2/2,100%,7/7,100%,, +Abnormal heart valve physiology [HP:0031653],2/2,100%,7/7,100%,, +Abnormal cardiovascular system physiology [HP:0011025],5/5,100%,30/30,100%,, +Abnormal tricuspid valve physiology [HP:0031651],0/0,0%,5/5,100%,, +Micrognathia [HP:0000347],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia of the mandible [HP:0009118],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia involving bones of the skull [HP:0009116],1/1,100%,1/1,100%,, +Aplasia/hypoplasia affecting bones of the axial skeleton [HP:0009122],2/2,100%,3/3,100%,, +Abnormal axial skeleton morphology [HP:0009121],5/5,100%,9/9,100%,, +Abnormal skull morphology [HP:0000929],2/2,100%,1/1,100%,, +Abnormality of the head [HP:0000234],2/2,100%,5/5,100%,, +Abnormality of head or neck [HP:0000152],2/2,100%,5/5,100%,, +Abnormal mandible morphology [HP:0000277],1/1,100%,1/1,100%,, +Abnormal jaw morphology [HP:0030791],1/1,100%,1/1,100%,, +Abnormal facial skeleton morphology [HP:0011821],1/1,100%,1/1,100%,, +Cleft soft palate [HP:0000185],0/0,0%,2/2,100%,, +Abnormal soft palate morphology [HP:0100736],0/0,0%,2/2,100%,, +Abnormal palate morphology [HP:0000174],0/0,0%,5/5,100%,, +Abnormal oral cavity morphology [HP:0000163],1/1,100%,5/5,100%,, +Abnormal oral morphology [HP:0031816],1/1,100%,5/5,100%,, +Abnormality of the mouth [HP:0000153],1/1,100%,5/5,100%,, +Abnormality of the face [HP:0000271],1/1,100%,5/5,100%,, +Cleft palate [HP:0000175],0/0,0%,2/2,100%,, +Orofacial cleft [HP:0000202],0/0,0%,2/2,100%,, +Craniofacial cleft [HP:5201015],0/0,0%,2/2,100%,, +Aplasia involving forearm bones [HP:0009822],6/6,100%,9/9,100%,, +Absent forearm bone [HP:0003953],6/6,100%,9/9,100%,, +Aplasia/hypoplasia of the humerus [HP:0006507],4/4,100%,8/8,100%,, +Abnormality of the humerus [HP:0003063],4/4,100%,8/8,100%,, +Abnormality of the upper arm [HP:0001454],4/4,100%,8/8,100%,, +1-2 finger syndactyly [HP:0010704],1/1,100%,4/4,100%,, +Finger syndactyly [HP:0006101],2/2,100%,5/5,100%,, +Syndactyly [HP:0001159],2/2,100%,5/5,100%,, +Complete atrioventricular canal defect [HP:0001674],3/36,8%,6/67,9%,, +Atrioventricular canal defect [HP:0006695],3/3,100%,6/6,100%,, +Upper limb phocomelia [HP:0009813],2/37,5%,8/116,7%,, +Phocomelia [HP:0009829],2/2,100%,8/8,100%,, +Abnormality of cardiovascular system electrophysiology [HP:0030956],3/3,100%,18/18,100%,, +Atrioventricular dissociation [HP:0011709],1/1,100%,0/22,0%,, +Abnormal morphology of ulna [HP:0040071],4/4,100%,4/4,100%,, +Aplasia/Hypoplasia of the ulna [HP:0006495],2/2,100%,4/4,100%,, +Mitral valve prolapse [HP:0001634],1/1,100%,1/1,100%,, +Abnormal mitral valve morphology [HP:0001633],1/1,100%,1/1,100%,, +Abnormal atrioventricular valve morphology [HP:0006705],1/1,100%,1/1,100%,, +Abnormal heart valve morphology [HP:0001654],1/1,100%,1/1,100%,, +Proximal placement of thumb [HP:0009623],0/0,0%,3/3,100%,, +Deviation of the thumb [HP:0009603],2/2,100%,3/3,100%,, +Deviation of finger [HP:0004097],2/2,100%,4/4,100%,, +Deviation of the hand or of fingers of the hand [HP:0009484],2/2,100%,5/5,100%,, +Short 1st metacarpal [HP:0010034],0/22,0%,1/45,2%,, +Short phalanx of the thumb [HP:0009660],0/22,0%,1/45,2%,, +Sinus bradycardia [HP:0001688],1/1,100%,2/2,100%,, +Abnormal electrophysiology of sinoatrial node origin [HP:0011702],1/1,100%,2/2,100%,, +Arrhythmia [HP:0011675],1/1,100%,3/3,100%,, +Bradycardia [HP:0001662],1/1,100%,2/2,100%,, +Hypoplasia of deltoid muscle [HP:0030241],0/0,0%,6/6,100%,, +Shoulder muscle hypoplasia [HP:0008952],0/0,0%,6/6,100%,, +Hypoplasia of the musculature [HP:0009004],0/0,0%,6/6,100%,, +Aplasia/Hypoplasia involving the skeletal musculature [HP:0001460],0/0,0%,6/6,100%,, +Abnormality of muscle size [HP:0030236],0/0,0%,6/6,100%,, +Abnormal skeletal muscle morphology [HP:0011805],0/0,0%,8/8,100%,, +Abnormality of the musculature [HP:0003011],0/0,0%,8/8,100%,, +Aplasia/Hypoplasia involving the shoulder musculature [HP:0001464],0/0,0%,6/6,100%,, +Aplasia/Hypoplasia involving the musculature of the upper limbs [HP:0001467],0/0,0%,6/6,100%,, +Aplasia/Hypoplasia involving the musculature of the extremities [HP:0009128],0/0,0%,6/6,100%,, +Abnormality of the musculature of the limbs [HP:0009127],0/0,0%,8/8,100%,, +Abnormality of the musculature of the upper limbs [HP:0001446],0/0,0%,8/8,100%,, +Abnormality of the shoulder girdle musculature [HP:0001435],0/0,0%,6/6,100%,, +Patent foramen ovale [HP:0001655],0/36,0%,4/69,6%,, +Synostosis of joints [HP:0100240],1/1,100%,1/1,100%,, +Abnormality of joint mobility [HP:0011729],5/5,100%,3/3,100%,, +Abnormal joint physiology [HP:0034430],5/5,100%,3/3,100%,, +Abnormal musculoskeletal physiology [HP:0011843],5/5,100%,3/3,100%,, +Abnormality of the vertebral column [HP:0000925],1/1,100%,4/4,100%,, +Upper extremity joint dislocation [HP:0030310],2/2,100%,0/0,0%,, +Joint dislocation [HP:0001373],2/2,100%,0/0,0%,, +Abnormal shoulder morphology [HP:0003043],1/1,100%,1/1,100%,, +Abnormal thorax morphology [HP:0000765],5/5,100%,7/7,100%,, +Limited elbow movement [HP:0002996],4/4,100%,2/2,100%,, +Abnormality of the elbow [HP:0009811],5/5,100%,2/2,100%,, +Limitation of joint mobility [HP:0001376],4/4,100%,2/2,100%,, +Mitral regurgitation [HP:0001653],2/2,100%,2/2,100%,, +Abnormal mitral valve physiology [HP:0031481],2/2,100%,2/2,100%,, +Pre-capillary pulmonary hypertension [HP:0033578],0/0,0%,8/8,100%,, +Elevated pulmonary artery pressure [HP:0004890],0/0,0%,8/8,100%,, +Abnormality of pulmonary circulation [HP:0030875],0/0,0%,8/8,100%,, +Abnormal vascular physiology [HP:0030163],0/0,0%,8/8,100%,, +Abnormal respiratory system physiology [HP:0002795],0/0,0%,8/8,100%,, +Abnormality of the respiratory system [HP:0002086],0/0,0%,8/8,100%,, +Right atrial enlargement [HP:0030718],0/0,0%,4/4,100%,, +Abnormal right atrium morphology [HP:0025580],0/0,0%,4/4,100%,, +Atrial septal dilatation [HP:0011995],0/0,0%,4/4,100%,, +Pectus excavatum [HP:0000767],2/2,100%,3/4,75%,, +Abnormal sternum morphology [HP:0000766],2/2,100%,3/3,100%,, +Postaxial hand polydactyly [HP:0001162],0/0,0%,3/4,75%,, +Postaxial polydactyly [HP:0100259],0/0,0%,3/3,100%,, +Polydactyly [HP:0010442],0/0,0%,3/3,100%,, +Abnormal 5th finger morphology [HP:0004207],0/0,0%,6/6,100%,, +Hand polydactyly [HP:0001161],0/0,0%,3/3,100%,, +Duplication of phalanx of hand [HP:0009997],0/0,0%,3/3,100%,, +Duplication of hand bones [HP:0004275],0/0,0%,3/3,100%,, +Duplication of bones involving the upper extremities [HP:0009142],0/0,0%,3/3,100%,, +High palate [HP:0000218],0/0,0%,3/3,100%,, +Short neck [HP:0000470],0/0,0%,3/3,100%,, +Abnormal neck morphology [HP:0025668],0/0,0%,3/3,100%,, +Abnormality of the neck [HP:0000464],0/0,0%,3/3,100%,, +Abnormality of the cervical spine [HP:0003319],0/0,0%,3/3,100%,, +Shield chest [HP:0000914],0/0,0%,3/3,100%,, +Enlarged thorax [HP:0100625],0/0,0%,3/3,100%,, +Abnormal rib cage morphology [HP:0001547],0/0,0%,5/5,100%,, +Y-shaped metatarsals [HP:0010567],0/0,0%,3/3,100%,, +Abnormal metatarsal morphology [HP:0001832],0/0,0%,3/3,100%,, +Abnormal lower limb bone morphology [HP:0040069],0/0,0%,4/4,100%,, +Abnormality of the lower limb [HP:0002814],0/0,0%,4/4,100%,, +Abnormal foot morphology [HP:0001760],0/0,0%,4/4,100%,, +Common atrium [HP:0011565],0/38,0%,1/115,1%,, +Unroofed coronary sinus [HP:0031297],0/38,0%,1/117,1%,, +Limited pronation/supination of forearm [HP:0006394],3/3,100%,2/2,100%,, +First degree atrioventricular block [HP:0011705],1/1,100%,1/23,4%,, +Hypoplastic scapulae [HP:0000882],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia of the scapulae [HP:0006713],1/1,100%,1/1,100%,, +Aplasia/Hypoplasia involving bones of the thorax [HP:0006711],2/2,100%,2/2,100%,, +Abnormal scapula morphology [HP:0000782],1/1,100%,1/1,100%,, +Clinodactyly of the 5th finger [HP:0004209],0/0,0%,2/2,100%,, +Finger clinodactyly [HP:0040019],0/0,0%,2/2,100%,, +Clinodactyly [HP:0030084],0/0,0%,2/2,100%,, +Deviation of the 5th finger [HP:0009179],0/0,0%,2/2,100%,, +Abnormal 2nd finger morphology [HP:0004100],0/0,0%,2/2,100%,, +Aplasia/Hypoplasia of the 2nd finger [HP:0006264],0/0,0%,2/2,100%,, +Short middle phalanx of finger [HP:0005819],0/0,0%,2/2,100%,, +Aplasia/Hypoplasia of the middle phalanges of the hand [HP:0009843],0/0,0%,2/2,100%,, +Abnormal middle phalanx morphology of the hand [HP:0009833],0/0,0%,2/2,100%,, +Short phalanx of finger [HP:0009803],0/0,0%,2/2,100%,, +Short middle phalanx of the 5th finger [HP:0004220],0/0,0%,2/2,100%,, +Type A brachydactyly [HP:0009370],0/0,0%,2/2,100%,, +Brachydactyly [HP:0001156],0/0,0%,2/2,100%,, +Short 5th finger [HP:0009237],0/0,0%,2/2,100%,, +Aplasia/Hypoplasia of the 5th finger [HP:0006262],0/0,0%,2/2,100%,, +Aplasia/Hypoplasia of the middle phalanx of the 5th finger [HP:0009161],0/0,0%,2/2,100%,, +Aplasia/Hypoplasia of the phalanges of the 5th finger [HP:0009376],0/0,0%,2/2,100%,, +Abnormal 5th finger phalanx morphology [HP:0004213],0/0,0%,2/2,100%,, +Abnormality of the middle phalanx of the 5th finger [HP:0004219],0/0,0%,2/2,100%,, +11 pairs of ribs [HP:0000878],0/0,0%,2/2,100%,, +Missing ribs [HP:0000921],0/0,0%,2/2,100%,, +Aplasia/Hypoplasia of the ribs [HP:0006712],0/0,0%,2/2,100%,, +Abnormal rib morphology [HP:0000772],0/0,0%,2/2,100%,, +Bowed forearm bones [HP:0003956],1/1,100%,0/0,0%,, +Bowing of the arm [HP:0006488],1/1,100%,0/0,0%,, +Bowing of the long bones [HP:0006487],1/1,100%,0/0,0%,, +Abnormal diaphysis morphology [HP:0000940],1/1,100%,0/0,0%,, +Abnormal skin morphology [HP:0011121],1/1,100%,0/0,0%,, +Abnormality of the skin [HP:0000951],1/1,100%,0/0,0%,, +Abnormality of the integument [HP:0001574],1/1,100%,0/0,0%,, +Sinus venosus atrial septal defect [HP:0011567],1/1,100%,0/2,0%,, +Left ventricular noncompaction cardiomyopathy [HP:0011664],1/5,20%,1/7,14%,, +Noncompaction cardiomyopathy [HP:0012817],1/1,100%,1/1,100%,, +Cardiomyopathy [HP:0001638],1/1,100%,1/1,100%,, +Abnormal myocardium morphology [HP:0001637],1/1,100%,1/1,100%,, +Abnormal toe morphology [HP:0001780],0/0,0%,1/1,100%,, +Abnormal toe phalanx morphology [HP:0010161],0/0,0%,1/1,100%,, +Abnormality of the distal phalanges of the toes [HP:0010182],0/0,0%,1/1,100%,, +Amelia involving the upper limbs [HP:0009812],1/37,3%,0/114,0%,, +Third degree atrioventricular block [HP:0001709],1/1,100%,0/22,0%,, +Small hypothenar eminence [HP:0010487],0/0,0%,2/2,100%,, +Abnormality of the hypothenar eminence [HP:0010486],0/0,0%,2/2,100%,, +Abnormality of the musculature of the hand [HP:0001421],0/0,0%,2/2,100%,, diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html new file mode 100644 index 00000000..3d00300a --- /dev/null +++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html @@ -0,0 +1,141 @@ + + + + + Cohort + + + + +

Phenotype testing report

+

Phenotype MTC filter: HPO MTC filter

+

Multiple testing correction: fdr_bh

+

Performed statistical tests for 17 out of the total of 260 HPO terms.

+
Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.
Code
TODOSkipping term because all genotypes have same HPO observed proportions42Skipping general term43
TODOSkipping general term41Skipping term because all genotypes have same HPO observed proportions42
3
TODOSkipping non-target term3
TODO
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.
CodeReasonCount
TODOSkipping term with only 2 observations (not powered for 2x2)51
TODOSkipping term because all genotypes have same HPO observed proportions50
TODOSkipping general term44
TODOSkipping term with only 3 observations (not powered for 2x2)27
TODOSkipping term with only 6 observations (not powered for 2x2)19
TODOSkipping term with only 4 observations (not powered for 2x2)16
TODOSkipping term with maximum frequency that was less than threshold 0.210
TODOSkipping term with only 5 observations (not powered for 2x2)10
TODOSkipping term with only 1 observations (not powered for 2x2)7
TODOSkipping term because one genotype had zero observations5
TODOSkipping term because no genotype has more than one observed HPO count4
+ + \ No newline at end of file diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 054763d5..993dd592 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -4,25 +4,32 @@ Statistical tests ================= -There are many different ways of statistically testing for genotype-phenotype correlations, -and the appropriate statistical test depends on the question. -This document provides an overview of the tests offered by the GPSEA library +There are many different ways of statistically testing for genotype-phenotype correlations, +and the appropriate statistical test depends on the question. +This document provides an overview of the tests offered by the GPSEA library and explanations of how they are implemented by our software. +************************************* +Compare genotype and phenotype groups +************************************* + +TODO + +.. _fisher-exact-test: Fisher exact test (FET) -~~~~~~~~~~~~~~~~~~~~~~~ +======================= -The Fisher exact test (FET) calculates the exact probability value -for the relationship between two dichotomous variables. +The Fisher exact test (FET) calculates the exact probability value +for the relationship between two dichotomous variables. In our implementation, the two dichotomous variables are the genotype and the phenotype. -For instance, the individuals of the cohort may be divided -according to whether or not they have a nonsense variant +For instance, the individuals of the cohort may be divided +according to whether or not they have a nonsense variant and according to whether or not they have Strabismus (`HP:0000486 `_). -The results of FET are expressed in terms of an exact probability (P-value), varying within 0 and 1. -Two groups are considered statistically significant if the P-value is less +The results of FET are expressed in terms of an exact probability (P-value), varying within 0 and 1. +Two groups are considered statistically significant if the P-value is less than the chosen significance level (usually :math:`\alpha = 0.05`). The following graphic shows an example contingency table that is used to conduct a Fisher exact test. @@ -32,12 +39,12 @@ We are comparing the frequency of *strabismus* in individuals with missense and :alt: Fisher exact text example :align: center :width: 600px - + To perform the corresponding test in Python, we would use the following code. >>> import scipy.stats as stats >>> contingency_table = [ -... # Missense, Nonsense +... # Missense, Nonsense ... [17, 6 ], # Strabismus observed ... [1, 19 ], # Strabismus excluded ... ] @@ -49,27 +56,213 @@ To perform the corresponding test in Python, we would use the following code. The ``p_value`` evaluates to `5.432292015291845e-06`, meaning there is a significant difference between the groups. -The Fisher exact test evaluates whether the observed frequencies in a contingency table significantly +The Fisher exact test evaluates whether the observed frequencies in a contingency table significantly deviate from the frequencies we would expect if there were no association between the variables. -We want to test whether the frequency of `HP:0000486`` is significantly higher or lower in -one genotype group compared to what would be expected if there were no association. -Note that by default, the *two-tailed* Fisher exact test is performed, meaning we have no -hypothesis as to whether there is a higher or lower frequency in one of the genotype groups. +We want to test whether the frequency of `HP:0000486`` is significantly higher or lower in +one genotype group compared to what would be expected if there were no association. +Note that by default, the *two-tailed* Fisher exact test is performed, meaning we have no +hypothesis as to whether there is a higher or lower frequency in one of the genotype groups. + +However, we are typically interested in testing the associations between the genotype and multiple phenotypic features at once. +GPSEA takes advantage of the HPO structure and simplifies the testing for all HPO terms encoded in the cohort. + +Example +------- + +Let's illustrate this in a real-life example of the analysis of the association between frameshift variants in *TBX5* gene +and congenital heart defects in the dataset of 156 individuals with mutations in *TBX5* whose signs and symptoms were +encoded into HPO terms, stored as phenopackets of the `GA4GH Phenopacket Schema `_, +and deposited in `Phenopacket Store `_ (version `0.1.18`). + +.. note:: + + The shorter version of the same analysis has been presented in the :ref:`tutorial`. + + +Create cohort +^^^^^^^^^^^^^ + +We will load and transform the phenopackets into a :class:`~gpsea.model.Cohort`, +as described in :ref:`input-data` section. Briefly, we will load the phenopackets: + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5')) +>>> len(phenopackets) +156 + +followed by loading HPO release `v2024-07-01`: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +and we will perform Q/C and functional annotations for the mutations +with the default cohort creator: + +>>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets +>>> cohort_creator = configure_caching_cohort_creator(hpo) +>>> cohort, qc_results = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Patients Created: ... +>>> qc_results.summarize() # doctest: +SKIP +Validated under none policy +No errors or warnings were found + +Configure analysis +^^^^^^^^^^^^^^^^^^ + +We want to test the association between frameshift *TBX5* variants and phenotypic abnormalities. +GPSEA exposes a flexible predicate API that lets us create genotype and phenotype predicates +to assign the cohort members into genotype and phenotype categories based on the variants +and the HPO terms. We need to create one genotype predicate and one or more phenotype predicates. + + +**Genotype predicate** + +We want to separate the patients into two groups: a group *with* a frameshift variant +and a group *without* a frameshift variant, based on the functional annotation. +We will use the *MANE* transcript for the analysis: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate +>>> tx_id = 'NM_181486.4' +>>> gt_predicate = boolean_predicate(VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id)) +>>> gt_predicate.get_question() +'FRAMESHIFT_VARIANT on NM_181486.4' + + +**Phenotype predicates** + +We recommend testing the genotype phenotype association for all HPO terms that are present in 2 or more cohort members, +while taking advantage of the HPO graph structure and of the :ref:`true-path-rule`. +We will use the :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` +utility function to generate phenotype predicates for all HPO terms: + +>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest +>>> pheno_predicates = prepare_predicates_for_terms_of_interest( +... cohort=cohort, +... hpo=hpo, +... min_n_of_patients_with_term=2, +... ) +>>> len(pheno_predicates) +260 + +The function finds all HPO terms that annotate at least *n* (``min_n_of_patients_with_term=2`` above) individuals, +including the *indirect* annotations whose presence is implied by the true path rule. + + +**Statistical test** + +We will use :ref: to test the association +between genotype and phenotype groups, as described previously. + +>>> from gpsea.analysis.pcats.stats import ScipyFisherExact +>>> count_statistic = ScipyFisherExact() + +FET will compute a p value for each genotype phenotype group. + + +**Multiple testing correction** + +In the case of this cohort, we could test association between having a frameshift variant and one of 260 HPO terms. +However, testing multiple hypotheses on the same dataset increases the risk of finding a significant association +by chance. +GPSEA uses a two-pronged strategy to mitigate this risk - use Phenotype MTC filter and multiple testing correction. + +.. note:: + + See the :ref:`mtc` section for more info on multiple testing procedures. + +Here we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) +with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``) +with a false discovery control level set to `0.05` (``mtc_alpha=0.05``): + +>>> from gpsea.analysis.mtc_filter import HpoMtcFilter +>>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=0.2) +>>> mtc_correction = 'fdr_bh' +>>> mtc_alpha = 0.05 + + +**Final analysis** + +We finalize the analysis setup by putting all components together +into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`: + +>>> from gpsea.analysis.pcats import HpoTermAnalysis +>>> analysis = HpoTermAnalysis( +... count_statistic=count_statistic, +... mtc_filter=mtc_filter, +... mtc_correction=mtc_correction, +... mtc_alpha=mtc_alpha, +... ) + + +Analysis +^^^^^^^^ + +We can now execute the analysis: + +>>> result = analysis.compare_genotype_vs_phenotypes( +... cohort=cohort, +... gt_predicate=gt_predicate, +... pheno_predicates=pheno_predicates, +... ) +>>> len(result.phenotypes) +260 +>>> result.total_tests +17 + + +Thanks to Phenotype MTC filter, we only tested 16 out of 260 terms. +We can learn more by showing the MTC filter report: + +>>> from gpsea.view import MtcStatsViewer +>>> mtc_viewer = MtcStatsViewer() +>>> mtc_report = mtc_viewer.process(result) +>>> with open('docs/user-guide/report/tbx5_frameshift.mtc_report.html', 'w') as fh: # doctest: +SKIP +... _ = fh.write(mtc_report) + + +.. raw:: html + :file: report/tbx5_frameshift.mtc_report.html + + +Genotype phenotype associations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Last, let's explore the associations. This is a table of the tested HPO terms +ordered by the corrected p value (Benjamini-Hochberg FDR): + +>>> from gpsea.analysis.predicate import PatientCategories +>>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP + +.. csv-table:: *TBX5* frameshift vs rest + :file: report/tbx5_frameshift.csv + :header-rows: 2 .. _phenotype-score-stats: -Mann-Whitney U Test -~~~~~~~~~~~~~~~~~~~ +*************** +Phenotype score +*************** + + +.. _mann-whitney-u-test: + +Mann-Whitney U Test +=================== We may want to compare the total number of occurences of a specific set of phenotypic features between two different genotype groups. -For instance, `Jordan et al (2018) `_ found that the total number of structural defects +For instance, `Jordan et al (2018) `_ found that the total number of structural defects of the brain, eye, heart, and kidney and sensorineural hearing loss seen in individuals with point mutations in the Atrophin-1 domain of the RERE gene is significantly higher than expected based on the number of similar defects seen in individuals with putative loss-of-function variants. -Since there are five potential defects, each individual has a count ranging between 0 and 5. +Since there are five potential defects, each individual has a count ranging between 0 and 5. We perform a Mann-Whitney U Test (or Wilcoxon Rank-Sum Test) to compare the distribution of such counts between genotype groups. -This is a non-parametric test that compares the medians of the two groups to determine if they come from the same distribution. +This is a non-parametric test that compares the medians of the two groups to determine if they come from the same distribution. >>> import scipy.stats as stats >>> group1 = [0, 0, 1, 0, 2, 0, 1, 1, 1, 0, 2, 0, 0, 3, 1, 1, 1, 0] @@ -79,14 +272,20 @@ This is a non-parametric test that compares the medians of the two groups to det >>> float(p_value) 6.348081479150902e-06 + ``p_value`` evaluates to `6.348081479150901e-06`, meaning there is a significant difference between the groups. Example -^^^^^^^ +------- + +Let's now analyze the subjects reported in *Jordan et al*. +We will load 19 phenopackets that represent individuals with mutations in *RERE* +whose signs and symptoms were encoded into HPO terms and deposited into Phenopacket Store. +The phenopackets will be processed into :class:`~gpsea.model.Cohort` +as described in the :ref:`input-data` section. -Let's now analyze the subjects reported in *Jordan et al*. -We start by loading the cohort from Phenopacket Store (version `0.1.18`): +Briefly, we will first load 19 phenopackets >>> from ppktstore.registry import configure_phenopacket_registry >>> registry = configure_phenopacket_registry() @@ -95,37 +294,55 @@ We start by loading the cohort from Phenopacket Store (version `0.1.18`): >>> len(phenopackets) 19 -We loaded 19 phenopackets. -Now, we need to prepare the phenopackets for using with GPSEA. -We will need HPO (version `v2024-07-01`) +and load HPO (version `v2024-07-01`) >>> import hpotk >>> store = hpotk.configure_ontology_store() >>> hpo = store.load_minimal_hpo(release='v2024-07-01') -to create cohort creator + +to create a :class:`~gpsea.preprocessing.CohortCreator` >>> from gpsea.preprocessing import configure_caching_cohort_creator >>> cohort_creator = configure_caching_cohort_creator(hpo) + which we will use to preprocess the cohort >>> from gpsea.preprocessing import load_phenopackets >>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE Patients Created: ... + + +resulting in a cohort consisting of 19 individuals + >>> len(cohort) 19 -Now we can set up the phenotype and genotype predicates. Jordan et al tests ... -.. todo: improve the text +Configure analysis +^^^^^^^^^^^^^^^^^^ + +Now we can set up the analysis of genotype and phenotype. +We will perform the analysis using the *RERE* transcript selected +as the "main" biologically relevant by the `MANE` consortium. ->>> rere_mane_tx_id = 'NM_001042681.2' +>>> tx_id = 'NM_001042681.2' -Now let's create a predicate for testing if the variant is a point mutation or a loss of function mutation. -The point mutation predicate is defined as ... -TODO: improve! + +**Genotype predicate** + +*Jordan et al.* compare phenotype of individuals harboring point mutations +with the individuals carrying loss of function mutations. +Let's create a predicate for testing if the variant +is a point mutation or a loss of function mutation. + +In this example, the point mutation is a mutation that meets the following conditions: + +* predicted to lead to a missense variant on the `MANE` transcript +* the :ref:`length-of-the-reference-allele` is equal to `1` +* the :ref:`change-length-of-an-allele` is equal to `0` >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates @@ -134,11 +351,12 @@ TODO: improve! ... ) >>> point_mutation = VariantPredicates.change_length('==', 0) \ ... & VariantPredicates.ref_length('==', 1) \ -... & VariantPredicates.any(VariantPredicates.variant_effect(effect, rere_mane_tx_id) for effect in point_mutation_effects) +... & VariantPredicates.any(VariantPredicates.variant_effect(effect, tx_id) for effect in point_mutation_effects) >>> point_mutation.get_question() '((change length == 0 AND ref allele length == 1) AND MISSENSE_VARIANT on NM_001042681.2)' -For the loss of function predicate, these variant effects are considered loss of function: + +For the loss of function predicate, the following variant effects are considered loss of function: >>> lof_effects = ( ... VariantEffect.TRANSCRIPT_ABLATION, @@ -146,10 +364,11 @@ For the loss of function predicate, these variant effects are considered loss of ... VariantEffect.START_LOST, ... VariantEffect.STOP_GAINED, ... ) ->>> lof_mutation = VariantPredicates.any(VariantPredicates.variant_effect(eff, rere_mane_tx_id) for eff in lof_effects) +>>> lof_mutation = VariantPredicates.any(VariantPredicates.variant_effect(eff, tx_id) for eff in lof_effects) >>> lof_mutation.get_question() '(TRANSCRIPT_ABLATION on NM_001042681.2 OR FRAMESHIFT_VARIANT on NM_001042681.2 OR START_LOST on NM_001042681.2 OR STOP_GAINED on NM_001042681.2)' + The genotype predicate will bin the patient into two groups: a point mutation group or the loss of function group: >>> from gpsea.analysis.predicate.genotype import groups_predicate @@ -160,8 +379,24 @@ The genotype predicate will bin the patient into two groups: a point mutation gr >>> gt_predicate.get_question() 'Genotype group: Point, LoF' -Now phenotype predicate. The authors divide the patients into groups according to the count of structural defects -in these groups: + +**Phenotype score** + +The authors score the individuals based on the number of structural defects +from the following 5 categories: + +* Brain anomalies +* Eye anomalies +* Congenital heart defects +* Renal anomalies +* Sensorineural hearing loss + +and they assign each individual a score based on the number structural defects. +For example, an individual with a congenital heart defect would be assigned a score of `1`, +an individual with congenital heart defect and a renal anomaly would be assigned a score of `2`, +and so on. + +We automatize this scoring method by encoding the categories into HPO terms >>> structural_defects = ( ... 'HP:0012443', # Abnormal brain morphology (Brain anomalies) @@ -171,58 +406,109 @@ in these groups: ... 'HP:0000407', # Sensorineural hearing impairment (Sensorineural hearing loss) ... ) -Let's run the analysis. ->>> from gpsea.analysis import configure_cohort_analysis ->>> analysis = configure_cohort_analysis( -... cohort, hpo, +and then test the individuals for presence of at least one HPO term +that corresponds to the structural defect category +(e.g. `Abnormal brain morphology `_) +or that is its descendant +(e.g. `Cerebellar atrophy `_). + +GPSEA implements this scoring method in :class:`~gpsea.analysis.pscore.CountingPhenotypeScorer`. + +>>> from gpsea.analysis.pscore import CountingPhenotypeScorer +>>> pheno_scorer = CountingPhenotypeScorer.from_query_curies( +... hpo=hpo, +... query=structural_defects, ... ) ->>> result = analysis.compare_genotype_vs_phenotype_group_count( -... gt_predicate=gt_predicate, -... phenotype_group_terms=structural_defects, + + +**Statistical test** + +We will use :ref:`mann-whitney-u-test` as described above. + +>>> from gpsea.analysis.pscore.stats import MannWhitneyStatistic +>>> score_statistic = MannWhitneyStatistic() + + +**Final analysis** + +We will put the final analysis together into :class:`~gpsea.analysis.pscore.PhenotypeScoreAnalysis`. + +>>> from gpsea.analysis.pscore import PhenotypeScoreAnalysis +>>> score_analysis = PhenotypeScoreAnalysis( +... score_statistic=score_statistic, ... ) ->>> round(result.p_value, 9) -0.027066902 -We have the counts: +Analysis +^^^^^^^^ + +We execute the analysis by running ->>> counts = result.genotype_phenotype_scores ->>> counts.head() # doctest: +NORMALIZE_WHITESPACE +>>> result = score_analysis.compare_genotype_vs_phenotype_score( +... cohort=cohort, +... gt_predicate=gt_predicate, +... pheno_scorer=pheno_scorer, +... ) + + +The analysis shows a significant difference between the number of structural defects +in individuals with point vs. loss-of-function mutations. + +>>> result.pval +0.012074957610483744 + + +To explore further, we can access a data frame with genotype categories and phenotype counts: + +>>> scores = result.genotype_phenotype_scores.sort_index() +>>> scores.head() # doctest: +NORMALIZE_WHITESPACE genotype phenotype -patient_id +patient_id Subject 10[PMID_27087320_Subject_10] 1 0 Subject 1[PMID_27087320_Subject_1] 0 4 +Subject 1[PMID_29330883_Subject_1] 1 0 Subject 2[PMID_27087320_Subject_2] None 4 Subject 2[PMID_29330883_Subject_2] 1 1 -Subject 3[PMID_27087320_Subject_3] 0 4 -The data frame provides a genotype category and a phenotype score for each patient. + +The data frame provides a `genotype` category and a `phenotype` score for each patient. The genotype category should be interpreted in the context of the genotype predicate: >>> gt_id_to_name = {c.category.cat_id: c.category.name for c in gt_predicate.get_categorizations()} >>> gt_id_to_name {0: 'Point', 1: 'LoF'} -Let's plot the data: + +The genotype code `0` is assigned to patients with a point mutation, `1` corresponds to the loss-of-function mutations, +and `None` is assigned to patients who cannot be assigned into any of the groups. + +Last, let's use :meth:`~gpsea.analysis.pscore.PhenotypeScoreAnalysisResult.plot_boxplots` method +to show a box plot of the phenotype score distributions: >>> import matplotlib.pyplot as plt >>> fig, ax = plt.subplots(figsize=(6, 4), dpi=120) ->>> data = counts.loc[counts['genotype'].notna()] # skip the patients with unassigned genotype group ->>> x = [data.loc[data['genotype'] == c.category.cat_id, 'phenotype'].to_list() for c in gt_predicate.get_categorizations()] ->>> gt_cat_labels = [gt_id_to_name[c.category.cat_id] for c in gt_predicate.get_categorizations()] ->>> bplot = ax.boxplot( -... x=x, -... patch_artist=True, tick_labels=gt_cat_labels, +>>> result.plot_boxplots( +... gt_predicate=gt_predicate, +... ax=ax, +... ) +>>> _ = ax.grid(axis="y") +>>> _ = ax.set( +... ylabel="Phenotype score", ylim=(-0.5, len(structural_defects) + 0.5) ... ) ->>> _ = ax.grid(axis='y') ->>> _ = ax.set(ylabel='Phenotype group count', ylim=(-.5, len(structural_defects) + .5)) ->>> for patch, color in zip(bplot['boxes'], ['darksalmon', 'honeydew']): -... patch.set_facecolor(color) ->>> fig.savefig('docs/img/phenotype_group_counts.png') # doctest: +SKIP +>>> fig.savefig('docs/img/rere_phenotype_score_boxplot.png') # doctest: +SKIP -.. image:: /img/phenotype_group_counts.png - :alt: Phenotype group counts +.. image:: /img/rere_phenotype_score_boxplot.png + :alt: Phenotype score distribution :align: center :width: 600px + + +We see that the individuals with the point mutations feature structural defects +than the individuals with the loss-of-function mutations. + +The box extends from the first quartile (Q1) to the third quartile (Q3) of the data, +with a red line at the median. +The whiskers extend from the box to the farthest data point +lying within 1.5x the inter-quartile range (IQR) from the box. diff --git a/src/gpsea/analysis/__init__.py b/src/gpsea/analysis/__init__.py index 0b4f157b..1adf3c2e 100644 --- a/src/gpsea/analysis/__init__.py +++ b/src/gpsea/analysis/__init__.py @@ -1,9 +1,7 @@ -from . import predicate - from ._api import CohortAnalysis, GenotypePhenotypeAnalysisResult, HpoMtcReport +# TODO This should go away from ._config import CohortAnalysisConfiguration, configure_cohort_analysis, configure_default_protein_metadata_service, MtcStrategy from ._gp_analysis import apply_predicates_on_patients -from ._mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter from ._util import prepare_hpo_terms_of_interest __all__ = [ @@ -11,7 +9,6 @@ 'CohortAnalysis', 'GenotypePhenotypeAnalysisResult', 'CohortAnalysisConfiguration', 'MtcStrategy', 'HpoMtcReport', - 'PhenotypeMtcFilter', 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter', 'apply_predicates_on_patients', 'configure_default_protein_metadata_service', 'prepare_hpo_terms_of_interest', diff --git a/src/gpsea/analysis/_api.py b/src/gpsea/analysis/_api.py index 5555e225..ca7bd8e4 100644 --- a/src/gpsea/analysis/_api.py +++ b/src/gpsea/analysis/_api.py @@ -7,9 +7,10 @@ from gpsea.model import Patient from gpsea.preprocessing import ProteinMetadataService -from .predicate import PolyPredicate, GenotypePolyPredicate, PatientCategory -from .predicate.genotype import VariantPredicate, ProteinPredicates -from .predicate.phenotype import P, PhenotypePolyPredicate, CountingPhenotypeScorer +from .predicate import PolyPredicate, PatientCategory +from .predicate.genotype import GenotypePolyPredicate, VariantPredicate, ProteinPredicates +from .predicate.phenotype import P, PhenotypePolyPredicate +from .pscore import PhenotypeScorer, CountingPhenotypeScorer PatientsByHPO = namedtuple('PatientsByHPO', field_names=['all_with_hpo', 'all_without_hpo']) @@ -18,6 +19,7 @@ class HpoMtcReport: """ Class to simplify reporting results of multiple testing filtering by HpoMtcFilter subclasses. """ + # TODO: delete with no replacement. def __init__( self, @@ -74,6 +76,7 @@ class GenotypePhenotypeAnalysisResult: """ `GenotypePhenotypeAnalysisResult` summarizes results of genotype-phenotype correlation analysis of a cohort. """ + # TODO: delete and use `gpsea.analysis.pcats.MultiPhenotypeAnalysisResult`. def __init__( self, @@ -142,7 +145,7 @@ def phenotype_categories(self) -> typing.Sequence[PatientCategory]: Get a sequence of phenotype patient categories that can be investigated. """ return self._phenotype_categories - + @property def total_tests(self) -> int: """ @@ -239,6 +242,7 @@ class PhenotypeScoreAnalysisResult: See :ref:`Mann Whitney U Test for phenotype score ` for more background. """ + # TODO: delete and use `gpsea.analysis.pscore.PhenotypeScoreAnalysisResult` def __init__( self, @@ -292,6 +296,7 @@ class CohortAnalysis(metaclass=abc.ABCMeta): The class provides various methods to test genotype-phenotype correlations. All methods wrap results into :class:`GenotypePhenotypeAnalysisResult`. """ + # TODO: remove and use the analyses described in `User Guide > Statistical tests`. def __init__( self, @@ -308,7 +313,7 @@ def compare_hpo_vs_genotype( predicate: VariantPredicate, ) -> GenotypePhenotypeAnalysisResult: """ - Bin patients according to a presence of at least one allele that matches `predicate` + Bin patients according to a presence of at least one allele that matches `predicate` and test for genotype-phenotype correlations. """ pass @@ -352,6 +357,7 @@ def compare_genotype_vs_phenotype_group_count( gt_predicate: GenotypePolyPredicate, phenotype_group_terms: typing.Iterable[typing.Union[str, hpotk.TermId]], ) -> PhenotypeScoreAnalysisResult: + # TODO: separate into pscore module assert isinstance(gt_predicate, GenotypePolyPredicate) assert gt_predicate.n_categorizations() == 2 @@ -369,7 +375,7 @@ def compare_genotype_vs_phenotype_group_count( def compare_genotype_vs_phenotype_score( self, gt_predicate: GenotypePolyPredicate, - phenotype_scorer: typing.Callable[[Patient,], float], + phenotype_scorer: PhenotypeScorer, ) -> PhenotypeScoreAnalysisResult: """ Score the patients with a phenotype scoring method and test for correlation between the genotype group @@ -386,8 +392,6 @@ def compare_genotype_vs_cohort_phenotypes( self, gt_predicate: GenotypePolyPredicate, ) -> GenotypePhenotypeAnalysisResult: - # TODO: if we had access to the cohort, it would be trivial to prepare - # all phenotype predicates and to call `self.compare_genotype_vs_phenotypes`. pass @abc.abstractmethod diff --git a/src/gpsea/analysis/_config.py b/src/gpsea/analysis/_config.py index cbbed148..3fb67fc7 100644 --- a/src/gpsea/analysis/_config.py +++ b/src/gpsea/analysis/_config.py @@ -8,8 +8,8 @@ from gpsea.model import Cohort from gpsea.preprocessing import ProteinMetadataService, UniprotProteinMetadataService, ProteinAnnotationCache, \ ProtCachingMetadataService +from .mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter from ._api import CohortAnalysis -from ._mtc_filter import PhenotypeMtcFilter, UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter from ._gp_analysis import FisherExactAnalyzer from ._gp_impl import GpCohortAnalysis diff --git a/src/gpsea/analysis/_gp_analysis.py b/src/gpsea/analysis/_gp_analysis.py index 63a43f18..35d14305 100644 --- a/src/gpsea/analysis/_gp_analysis.py +++ b/src/gpsea/analysis/_gp_analysis.py @@ -6,11 +6,12 @@ from statsmodels.stats import multitest from gpsea.model import Patient +from .mtc_filter import PhenotypeMtcFilter +from .predicate import PatientCategory +from .predicate.genotype import GenotypePolyPredicate +from .predicate.phenotype import PhenotypePolyPredicate, P from ._api import GenotypePhenotypeAnalysisResult, HpoMtcReport -from ._mtc_filter import PhenotypeMtcFilter from ._stats import run_fisher_exact, run_recessive_fisher_exact -from .predicate import GenotypePolyPredicate, PatientCategory -from .predicate.phenotype import PhenotypePolyPredicate, P def apply_predicates_on_patients( @@ -44,33 +45,31 @@ def apply_predicates_on_patients( - a mapping from phenotype :class:`P` to a data frame with counts of patients in i-th phenotype category and j-th genotype category where i and j are rows and columns of the data frame """ + # TODO: delete with no replacement. phenotypes = set() categories = set() for predicate in pheno_predicates: categories.update(predicate.get_categories()) - phenotypes.update(predicate.phenotypes) + phenotypes.add(predicate.phenotype) n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes)) # Apply genotype and phenotype predicates counts = {} for ph_predicate in pheno_predicates: - phenotypes = ph_predicate.phenotypes - - for phenotype in phenotypes: - if phenotype not in counts: - # Make an empty frame for keeping track of the counts. - counts[phenotype] = pd.DataFrame( - data=0, - index=pd.Index( - data=ph_predicate.get_categories(), - name=ph_predicate.get_question(), - ), - columns=pd.Index( - data=gt_predicate.get_categories(), - name=gt_predicate.get_question(), - ), - ) + if ph_predicate.phenotype not in counts: + # Make an empty frame for keeping track of the counts. + counts[ph_predicate.phenotype] = pd.DataFrame( + data=0, + index=pd.Index( + data=ph_predicate.get_categories(), + name=ph_predicate.get_question(), + ), + columns=pd.Index( + data=gt_predicate.get_categories(), + name=gt_predicate.get_question(), + ), + ) for patient in patients: pheno_cat = ph_predicate.test(patient) @@ -87,6 +86,7 @@ class GPAnalyzer(typing.Generic[P], metaclass=abc.ABCMeta): """ `GPAnalyzer` calculates p values for genotype-phenotype correlation of phenotypic features of interest. """ + # TODO: delete with no replacement. @abc.abstractmethod def analyze( @@ -115,6 +115,7 @@ class FisherExactAnalyzer(typing.Generic[P], GPAnalyzer[P]): should be applied. :param mtc_alpha: a `float` in range :math:`(0, 1]` with the multiple testing correction alpha value. """ + # TODO: delete and use `gpsea.analysis.pcats.MultiPhenotypeAnalysis`. def __init__( self, diff --git a/src/gpsea/analysis/_gp_impl.py b/src/gpsea/analysis/_gp_impl.py index a35c6b96..c6251f8b 100644 --- a/src/gpsea/analysis/_gp_impl.py +++ b/src/gpsea/analysis/_gp_impl.py @@ -6,10 +6,10 @@ from scipy.stats import mannwhitneyu -from gpsea.model import Cohort, Patient +from gpsea.model import Cohort from gpsea.preprocessing import ProteinMetadataService -from .predicate import GenotypePolyPredicate -from .predicate.genotype import VariantPredicate +from .pscore import PhenotypeScorer +from .predicate.genotype import GenotypePolyPredicate, VariantPredicate from .predicate.genotype import boolean_predicate as wrap_as_boolean_predicate from .predicate.genotype import groups_predicate as wrap_as_groups_predicate from .predicate.genotype import recessive_predicate as wrap_as_recessive_predicate @@ -131,7 +131,7 @@ def _prepare_disease_predicates( def compare_genotype_vs_phenotype_score( self, gt_predicate: GenotypePolyPredicate, - phenotype_scorer: typing.Callable[[Patient,], float], + phenotype_scorer: PhenotypeScorer, ) -> PhenotypeScoreAnalysisResult: idx = pd.Index(tuple(p.patient_id for p in self._patient_list), name='patient_id') data = pd.DataFrame( @@ -144,7 +144,7 @@ def compare_genotype_vs_phenotype_score( for patient in self._patient_list: gt_cat = gt_predicate.test(patient) data.loc[patient.patient_id, 'genotype'] = None if gt_cat is None else gt_cat.category.cat_id - data.loc[patient.patient_id, 'phenotype'] = phenotype_scorer(patient) + data.loc[patient.patient_id, 'phenotype'] = phenotype_scorer.score(patient) # To improve the determinism data.sort_index(inplace=True) diff --git a/src/gpsea/analysis/_inheritance.py b/src/gpsea/analysis/_inheritance.py deleted file mode 100644 index 5c0c4236..00000000 --- a/src/gpsea/analysis/_inheritance.py +++ /dev/null @@ -1,29 +0,0 @@ -import abc -import typing - -from gpsea.model import Variant - - -class InheritanceCompatibilityChecker(metaclass=abc.ABCMeta): - - @abc.abstractmethod - def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool: - pass - - -class AutosomalDominantChecker(InheritanceCompatibilityChecker): - - def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool: - pass - - -class AutosomalRecessiveChecker(InheritanceCompatibilityChecker): - - def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool: - pass - - -class MitochondrialChecker(InheritanceCompatibilityChecker): - - def check_compatibility(self, patient: typing.Sequence[Variant]) -> bool: - pass diff --git a/src/gpsea/analysis/_stats.py b/src/gpsea/analysis/_stats.py index f2a480b9..1364e7d5 100644 --- a/src/gpsea/analysis/_stats.py +++ b/src/gpsea/analysis/_stats.py @@ -6,6 +6,8 @@ import numpy as np import scipy +# TODO: delete the module + class MultiFisherExact(metaclass=abc.ABCMeta): diff --git a/src/gpsea/analysis/_test_fisherExact.py b/src/gpsea/analysis/_test_fisherExact.py index aba6ff26..fd530f5c 100644 --- a/src/gpsea/analysis/_test_fisherExact.py +++ b/src/gpsea/analysis/_test_fisherExact.py @@ -8,6 +8,8 @@ def MultiExact() -> PythonMultiFisherExact: return PythonMultiFisherExact() +# TODO: remove + @pytest.mark.parametrize('table, raise_error, pVal', ([[[0,0,0],[0,0,0]], pytest.raises(ValueError), None], [[[2, 1, 0],[3, 0, 2]], does_not_raise(), 0.6429], diff --git a/src/gpsea/analysis/_util.py b/src/gpsea/analysis/_util.py index 1e0bdc77..2c3445b5 100644 --- a/src/gpsea/analysis/_util.py +++ b/src/gpsea/analysis/_util.py @@ -24,6 +24,7 @@ def prepare_hpo_terms_of_interest( :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term (either directly or indirectly) for the term to be included in the analysis. """ + # TODO remove in favor of `gpsea.analysis.predicate.phenotype` present_count = Counter() excluded_count = Counter() diff --git a/src/gpsea/analysis/mtc_filter/__init__.py b/src/gpsea/analysis/mtc_filter/__init__.py new file mode 100644 index 00000000..4497ac34 --- /dev/null +++ b/src/gpsea/analysis/mtc_filter/__init__.py @@ -0,0 +1,14 @@ +""" +The `gpsea.analysis.mtc_filter` provides the strategies for reducing multiple testing burden +by pre-filtering the phenotype terms to test in advance. + +See :ref:`MTC filters ` section for more info. +""" + +from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult +from ._impl import UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter + +__all__ = [ + 'PhenotypeMtcFilter', 'PhenotypeMtcResult', + 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter', +] diff --git a/src/gpsea/analysis/_mtc_filter.py b/src/gpsea/analysis/mtc_filter/_impl.py similarity index 68% rename from src/gpsea/analysis/_mtc_filter.py rename to src/gpsea/analysis/mtc_filter/_impl.py index 2025c0ed..b15e4bc6 100644 --- a/src/gpsea/analysis/_mtc_filter.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -6,10 +6,62 @@ import hpotk import pandas as pd -from .predicate import PatientCategories, PatientCategory, GenotypePolyPredicate +from ..predicate import PatientCategories, PatientCategory +from ..predicate.genotype import GenotypePolyPredicate +from ..predicate.phenotype import P -class PhenotypeMtcFilter(metaclass=abc.ABCMeta): +class PhenotypeMtcResult: + """ + `PhenotypeMtcResult` represents a result of :class:`PhenotypeMtcFilter` for a single phenotype. + + The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`) + of be filtered out (:meth:`is_filtered_out`) in which case :attr:`reason` must be available. + """ + + @staticmethod + def ok() -> "PhenotypeMtcResult": + # A singleton would be nice here... + return PhenotypeMtcResult(status=True, reason=None) + + @staticmethod + def fail(reason: str) -> "PhenotypeMtcResult": + return PhenotypeMtcResult(status=False, reason=reason) + + def __init__( + self, + status: bool, + reason: typing.Optional[str], + ): + self._status = status + self._reason = reason + + def is_passed(self) -> bool: + return self._status + + def is_filtered_out(self) -> bool: + return not self._status + + @property + def reason(self) -> typing.Optional[str]: + return self._reason + + def __eq__(self, value: object) -> bool: + return isinstance(value, PhenotypeMtcResult) \ + and self._status == value._status \ + and self._reason == value._reason + + def __hash__(self) -> int: + return hash((self._status, self._reason)) + + def __str__(self) -> str: + return f'PhenotypeMtcResult(status={self._status}, reason={self._reason})' + + def __repr__(self) -> str: + return str(self) + + +class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta): """ `PhenotypeMtcFilter` decides which phenotypes should be tested and which phenotypes are not worth testing in order to reduce the multiple testing burden. @@ -20,6 +72,24 @@ class PhenotypeMtcFilter(metaclass=abc.ABCMeta): categorized according to the HPO term. """ + @abc.abstractmethod + def filter( + self, + gt_predicate: GenotypePolyPredicate, + phenotypes: typing.Sequence[P], + counts: typing.Sequence[pd.DataFrame], + ) -> typing.Sequence[PhenotypeMtcResult]: + """ + Test if the phenotype with given counts should be included in the downstream analysis. + + :param gt_predicate: the predicate that produced the columns of the `count` data frame. + :param phenotypes: the tested phenotypes. + :param counts: a sequence of 2D data frames for the tested phenotypes. + Each data frame corrresponds to a genotype/phenotype contingency matrix. + :returns: a sequence of filter results for the input `phenotypes`. + """ + pass + @abc.abstractmethod def filter_terms_to_test( self, @@ -58,13 +128,25 @@ def filter_method_name(self) -> str: pass -class UseAllTermsMtcFilter(PhenotypeMtcFilter): +class UseAllTermsMtcFilter(PhenotypeMtcFilter[typing.Any]): """ `UseAllTermsMtcFilter` filters out *no* phenotype terms. See :ref:`use-all-terms-strategy` section for more info. """ + def __init__(self): + self._ok = PhenotypeMtcResult.ok() + + def filter( + self, + gt_predicate: GenotypePolyPredicate, + phenotypes: typing.Sequence[P], + counts: typing.Sequence[pd.DataFrame], + ) -> typing.Sequence[PhenotypeMtcResult]: + # Always OK! 😏 + return tuple(self._ok for _ in phenotypes) + def filter_terms_to_test( self, gt_predicate: GenotypePolyPredicate, @@ -85,7 +167,7 @@ def filter_method_name(self) -> str: return "All HPO terms" -class SpecifiedTermsMtcFilter(PhenotypeMtcFilter): +class SpecifiedTermsMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): """ `SpecifiedTermsMtcFilter` limits the HPO terms to be tested to a selection of provided terms. @@ -108,9 +190,25 @@ def __init__( hpo: reference to HPO ontology object terms_to_test: an iterable of TermIds representing the terms to test """ + self._ok = PhenotypeMtcResult.ok() + self._fail = PhenotypeMtcResult.fail("Non-specified term") self._hpo = hpo self._terms_to_test_set = set(terms_to_test) + def filter( + self, + gt_predicate: GenotypePolyPredicate, + phenotypes: typing.Sequence[P], + counts: typing.Sequence[pd.DataFrame], + ) -> typing.Sequence[PhenotypeMtcResult]: + results = [] + for phenotype in phenotypes: + if phenotype in self._terms_to_test_set: + results.append(self._ok) + else: + results.append(self._fail) + return tuple(results) + def filter_terms_to_test( self, gt_predicate: GenotypePolyPredicate, @@ -153,12 +251,14 @@ def filter_method_name(self) -> str: return "Specified terms MTC filter" -class HpoMtcFilter(PhenotypeMtcFilter): +class HpoMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): """ `HpoMtcFilter` decides which phenotypes should be tested and which phenotypes are not worth testing. The class leverages a number of heuristics and domain decisions. See :ref:`hpo-mtc-filter-strategy` section for more info. + + We recommend creating an instance using the :func:`default_filter` static factory method. """ # TODO: this has been here before. Is it still valid? @@ -176,6 +276,7 @@ class HpoMtcFilter(PhenotypeMtcFilter): def default_filter( hpo: hpotk.MinimalOntology, term_frequency_threshold: float, + phenotypic_abnormality: hpotk.TermId = hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, ): """ Args: @@ -184,21 +285,25 @@ def default_filter( for an HPO term to have in at least one of the genotype groups (e.g., 22% in missense and 3% in nonsense genotypes would be OK, but not 13% missense and 10% nonsense genotypes if the threshold is 0.2) + phenotypic_abnormality: a :class:`~hpotk.TermId` corresponding to the root of HPO phenotype hierarchy. + Having to specify this option should be very rarely, if ever. """ - # ## Collect sets of top-level and second-level terms + # The very top of the ontology is completely uninteresting. + # Knowing about association between `All`, or `Phenotypic abnormality` tells us very little + top_level_terms = {phenotypic_abnormality, } + top_level_terms.update(hpo.graph.get_ancestors(phenotypic_abnormality)) + + # Collect sets of the 1st level terms # e.g., Abnormality of the cardiovascular system (HP:0001626) - top_level_terms = set( - hpo.graph.get_children(hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY) + first_level_terms = set( + hpo.graph.get_children(phenotypic_abnormality) ) - # e.g. Abnormality of the vasculature (HP:0002597) - second_level_terms = set() - third_level_terms = set() - # the second level (children of the top level), contains terms we want to keep, # such as HP:0001611 Hypernasal speech, but it also contains "general" terms that # we skip according to this heuristic, e.g., HP:0030680 Abnormal cardiovascular system morphology - for t1 in top_level_terms: + second_level_terms = set() + for t1 in first_level_terms: for t2 in hpo.graph.get_children(t1): label = hpo.get_term_name(t2) if label is not None and label.startswith("Abnormal"): @@ -207,6 +312,7 @@ def default_filter( # The third level (children of the second level), contains terms we want to keep, # such as HP:0031109 Agalactia, but it also contains "general" terms # that we skip according to this heuristic, e.g., HP:0006500 Abnormal lower limb epiphysis morphology + third_level_terms = set() for t2 in second_level_terms: for t3 in hpo.graph.get_children(t2): label = hpo.get_term_name(t3) @@ -217,6 +323,7 @@ def default_filter( # In the future, one could do this on a per-level basis. general_hpo_term_set = set() general_hpo_term_set.update(top_level_terms) + general_hpo_term_set.update(first_level_terms) general_hpo_term_set.update(second_level_terms) general_hpo_term_set.update(third_level_terms) @@ -248,6 +355,107 @@ def __init__( # thus if the total count is 6 or less, there is no power - CAN WE SIMPLIFY? self._general_hpo_terms = set(general_hpo_terms) + self._ok = PhenotypeMtcResult.ok() + + def filter( + self, + gt_predicate: GenotypePolyPredicate, + phenotypes: typing.Sequence[hpotk.TermId], + counts: typing.Sequence[pd.DataFrame], + ) -> typing.Sequence[PhenotypeMtcResult]: + p_to_idx = {p: i for i, p in enumerate(phenotypes)} + + results = [None for _ in range(len(phenotypes))] + categories = tuple(gt_predicate.get_categories()) + for term_id in self._get_ordered_terms(phenotypes): + try: + idx = p_to_idx[term_id] + except KeyError: + # The procedure for getting the ordered terms produced a term + # that is not included in `phenotypes`. + # This is normal and we just move on. + continue + + if term_id in self._general_hpo_terms: + results[idx] = PhenotypeMtcResult.fail("Skipping general term") + continue + + if not self._hpo.graph.is_ancestor_of( + hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id + ): + results[idx] = PhenotypeMtcResult.fail("Skipping non phenotype term") + continue + + # get total number of observations + # if term_id not in all_counts: + # reason_for_filtering_out["Skipping non-target term"] += 1 + # continue + + contingency_matrix = counts[idx] + total = contingency_matrix.sum().sum() + max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( + contingency_matrix + ) + if max_freq < self._hpo_term_frequency_filter: + reason = ( + "Skipping term with maximum frequency " + f"that was less than threshold {self._hpo_term_frequency_filter}" + ) + results[idx] = PhenotypeMtcResult.fail(reason) + continue + + if contingency_matrix.shape == (2, 2) and total < 7: + reason = f"Skipping term with only {total} observations (not powered for 2x2)" + results[idx] = PhenotypeMtcResult.fail(reason) + continue + + # todo -- similar for (3,2) + if not HpoMtcFilter.some_cell_has_greater_than_one_count(contingency_matrix): + reason = "Skipping term because no genotype has more than one observed HPO count" + results[idx] = PhenotypeMtcResult.fail(reason) + continue + + elif HpoMtcFilter.genotypes_have_same_hpo_proportions( + contingency_matrix, + categories, + ): + reason = "Skipping term because all genotypes have same HPO observed proportions" + results[idx] = PhenotypeMtcResult.fail(reason) + continue + + elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( + contingency_matrix, + categories, + ): + reason = "Skipping term because one genotype had zero observations" + results[idx] = PhenotypeMtcResult.fail(reason) + continue + + # TODO: the code below actually did not work. + # for child_term_id in self._hpo.graph.get_children(term_id): + # if child_term_id in tested_counts_pf: + # if tested_counts_pf[child_term_id].equals(contingency_matrix): + # # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches? + # reason = "Child term with same counts previously tested" + # reason_for_filtering_out[reason] += 1 + # continue + + # The term should be tested if we get here. + results[idx] = self._ok + + # There should be no `None` elements in `results` at this point. + if any(r is None for r in results): + missing = [] + for i, result in enumerate(results): + if result is None: + term_name = self._hpo.get_term_name(phenotypes[i]) + missing.append(term_name) + + print('BLAAAAAA') + msg = 'Missing results for {}'.format(', '.join(missing)) + raise ValueError(msg) + + return tuple(results) def filter_terms_to_test( self, @@ -275,6 +483,7 @@ def filter_terms_to_test( ) # key is an HP id, value is a tuple with counts, i.e., # iterate through the terms in the sorted order, starting from the leaves of the induced graph. + categories = tuple(gt_predicate.get_categories()) for term_id in self._get_ordered_terms(all_counts.keys()): if term_id in self._general_hpo_terms: reason_for_filtering_out["Skipping general term"] += 1 @@ -317,7 +526,7 @@ def filter_terms_to_test( elif HpoMtcFilter.genotypes_have_same_hpo_proportions( counts_frame, - gt_predicate.get_categories(), + categories, ): reason = "Skipping term because all genotypes have same HPO observed proportions" reason_for_filtering_out[reason] += 1 @@ -325,7 +534,7 @@ def filter_terms_to_test( elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( counts_frame, - gt_predicate.get_categories(), + categories, ): reason = "Skipping term because one genotype had zero observations" reason_for_filtering_out[reason] += 1 @@ -454,26 +663,12 @@ def _get_ordered_terms( ("leaves" in the graph of observed terms, but not necessarily leaves in the entire HPO graph) to the most general terms. """ - # When building the induced graph based on the `term_ids`, we do not want to - # include the term for *Phenotypic abnormality* and *All*. - ancestors_to_ignore = set( - self._hpo.graph.get_ancestors( - hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY - ) - ) - ancestors_to_ignore.add(hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY) - # A "leaf" refers to an annotated term none of whose children is annotated. # May or may not be leaf in HPO graph all_term_ids = set() for term_id in term_ids: all_term_ids.add(term_id) - all_term_ids.update( - filter( - lambda t: t not in ancestors_to_ignore, - self._hpo.graph.get_ancestors(term_id), - ) - ) + all_term_ids.update(self._hpo.graph.get_ancestors(term_id)) # get all of the leaf terms hpo_leaf_term_id_set = set() @@ -489,7 +684,7 @@ def _get_ordered_terms( ordered_term_list = list() # reset, if needed queue = deque(hpo_leaf_term_id_set) - seen_terms = set(ancestors_to_ignore) + seen_terms = set() while len(queue) > 0: term_id = queue.popleft() if term_id not in seen_terms: diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py new file mode 100644 index 00000000..6a82d92f --- /dev/null +++ b/src/gpsea/analysis/pcats/__init__.py @@ -0,0 +1,29 @@ +""" +The `gpsea.analysis.pcats` tests the association between genotype and phenotype groups, +if the groups can be defined in terms of discrete, unique, and non-overlapping categories. + +Each individual is assigned into a genotype and phenotype group +using :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +and :class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate` respectively. + +A contingency matrix with group counts is prepared +and the counts are tested for association using :class:`~gpsea.analysis.pcats.stats.CountStatistic`, +such as :class:`~gpsea.analysis.pcats.stats.ScipyFisherExact`. + +It is typical to test several phenotype groups at the same time. +Therefore, we must correct for multiple testing to prevent false positive findings. +See :ref:`MTC section ` for more info. + +The results are provided as :class:`MultiPhenotypeAnalysisResult` +(or more specific :class:`HpoTermAnalysisResult` for :class:`HpoTermAnalysis`). +""" + +from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult +from ._impl import DiseaseAnalysis +from ._impl import HpoTermAnalysis, HpoTermAnalysisResult + +__all__ = [ + 'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult', + 'DiseaseAnalysis', + 'HpoTermAnalysis', 'HpoTermAnalysisResult', +] diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py new file mode 100644 index 00000000..61841a51 --- /dev/null +++ b/src/gpsea/analysis/pcats/_impl.py @@ -0,0 +1,561 @@ +import abc +import math +import typing + +from collections import Counter + +import hpotk +import numpy as np +import pandas as pd + +from statsmodels.stats import multitest + +from gpsea.model import Patient +from gpsea.analysis.pcats.stats import CountStatistic +from ..predicate import PatientCategory +from ..predicate.genotype import GenotypePolyPredicate +from ..predicate.phenotype import P, PhenotypePolyPredicate, prepare_predicates_for_terms_of_interest +from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult + + +def apply_predicates_on_patients( + patients: typing.Iterable[Patient], + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Sequence[PhenotypePolyPredicate[P]], +) -> typing.Tuple[ + typing.Sequence[int], + typing.Sequence[pd.DataFrame], +]: + """ + Apply the phenotype predicates `pheno_predicates` and the genotype predicate `gt_predicate` + to bin the `patients` into categories. + + Note, it may not be possible to bin *all* patients with a genotype/phenotype pair, + since a predicate is allowed to return `None` (e.g. if it bins the patient into MISSENSE or NONSENSE groups + but the patient has no MISSENSE or NONSENSE variants). If this happens, the patient will not be "usable" + for the phenotype `P`. + + Args: + patients: a sequence of the patients to bin into categories + gt_predicate: a genotype predicate to apply + pheno_predicates: a sequence with the phenotype predicates to apply + + Returns: + a tuple with 2 items: + - a sequence with counts of patients that could be binned according to the phenotype `P`. + - a sequence with data frames with counts of patients in i-th phenotype category + and j-th genotype category where i and j are rows and columns of the data frame. + """ + n_usable_patient_counter = Counter() + + # Apply genotype and phenotype predicates + count_dict = {} + for ph_predicate in pheno_predicates: + if ph_predicate.phenotype not in count_dict: + # Make an empty frame for keeping track of the counts. + count_dict[ph_predicate.phenotype] = pd.DataFrame( + data=0, + index=pd.Index( + data=ph_predicate.get_categories(), + name=ph_predicate.get_question(), + ), + columns=pd.Index( + data=gt_predicate.get_categories(), + name=gt_predicate.get_question(), + ), + ) + + for patient in patients: + pheno_cat = ph_predicate.test(patient) + geno_cat = gt_predicate.test(patient) + + if pheno_cat is not None and geno_cat is not None: + count_dict[pheno_cat.phenotype].loc[ + pheno_cat.category, geno_cat.category + ] += 1 + n_usable_patient_counter[pheno_cat.phenotype] += 1 + + # Convert dicts to numpy arrays + n_usable_patients = [ + n_usable_patient_counter[ph_predicate.phenotype] + for ph_predicate in pheno_predicates + ] + + counts = [count_dict[ph_predicate.phenotype] for ph_predicate in pheno_predicates] + + return n_usable_patients, counts + + +class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta): + """ + `MultiPhenotypeAnalysisResult` reports the outcome of :class:`MultiPhenotypeAnalysis`. + + The result consists of several arrays with items + with the order determined by the phenotype order. + """ + + @property + @abc.abstractmethod + def phenotypes(self) -> typing.Sequence[P]: + """ + Get the tested phenotypes. + """ + pass + + @property + @abc.abstractmethod + def n_usable(self) -> typing.Sequence[int]: + """ + Get a sequence of numbers of patients where the phenotype was assessable, + and are, thus, usable for genotype-phenotype correlation analysis. + """ + pass + + @property + @abc.abstractmethod + def all_counts(self) -> typing.Sequence[pd.DataFrame]: + """ + Get a :class:`~pandas.DataFrame` sequence where each `DataFrame` includes the counts of patients + in genotype and phenotype groups. + + An example for a genotype predicate that bins into two categories (`Yes` and `No`) based on presence + of a missense variant in transcript `NM_123456.7`, and phenotype predicate that checks + presence/absence of `HP:0001166` (a phenotype term):: + + Has MISSENSE_VARIANT in NM_123456.7 + No Yes + Present + Yes 1 13 + No 7 5 + + The rows correspond to the phenotype categories, and the columns represent the genotype categories. + """ + pass + + @property + @abc.abstractmethod + def pvals(self) -> typing.Sequence[float]: + """ + Get a sequence of nominal p values for each tested HPO term. + The sequence includes a `NaN` value for each input phenotype that was *not* tested. + """ + pass + + @property + @abc.abstractmethod + def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: + """ + Get a sequence with p values for each tested HPO term after multiple testing correction + or `None` if the correction was not applied. + The sequence includes a `NaN` value for each input phenotype that was *not* tested. + """ + pass + + @property + def total_tests(self) -> int: + """ + Get total count of tests that were run for this analysis. + """ + return sum(1 for pval in self.pvals if not math.isnan(pval)) + + def summarize( + self, + hpo: hpotk.MinimalOntology, + target_phenotype_category: PatientCategory, + ) -> pd.DataFrame: + """ + Create a data frame with summary of the genotype phenotype analysis. + + The *rows* of the frame correspond to the analyzed HPO terms. + + The columns of the data frame have `Count` and `Percentage` per used genotype predicate. + + **Example** + + If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate` + which can compare phenotype with and without a missense variant, we will have a data frame + that looks like this:: + + MISSENSE_VARIANT on `NM_1234.5` No Yes + Count Percent Count Percent Corrected p value p value + Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781 + Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000 + Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000 + ... ... ... ... ... ... ... + """ + + # TODO: this should be plotted by a formatter. + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(self.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + # TODO: fix the below + iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")), + names=(self._gt_predicate.get_question(), None), + ) + + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + + for phenotype, count in zip(self.phenotypes, self.all_counts): + gt_totals = ( + count.sum() + ) # Sum across the phenotype categories (collapse the rows). + for gt_cat in count.columns: + cnt = count.loc[target_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%" + + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if self.corrected_pvals is not None: + df.insert( + df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals + ) + df.insert(df.shape[1], ("", p_val_col_name), self.pvals) + + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map( + lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id) + ) + + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + if self.corrected_pvals is not None: + return df.sort_values( + by=[("", corrected_p_val_col_name), ("", p_val_col_name)] + ) + else: + return df.sort_values(by=("", p_val_col_name)) + + @staticmethod + def _format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, + ) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). + """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value + + +class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta): + + def __init__( + self, + count_statistic: CountStatistic, + mtc_correction: typing.Optional[str] = None, + mtc_alpha: float = 0.05, + ): + assert isinstance(count_statistic, CountStatistic) + self._count_statistic = count_statistic + self._mtc_correction = mtc_correction + self._mtc_alpha = mtc_alpha + + @abc.abstractmethod + def compare_genotype_vs_phenotypes( + self, + cohort: typing.Iterable[Patient], + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + ) -> MultiPhenotypeAnalysisResult[P]: + pass + + def _compute_nominal_pvals( + self, + n_usable: typing.Iterable[int], + all_counts: typing.Iterable[pd.DataFrame], + ) -> np.ndarray: + pvals = [] + for i, (usable, count) in enumerate(zip(n_usable, all_counts)): + if usable == 0: + pvals.append(np.nan) + else: + try: + pval = self._count_statistic.compute_pval(count) + pvals.append(pval) + except ValueError as ve: + # TODO: add more context to the exception? + raise ve + + return np.array(pvals) + + def _apply_mtc( + self, + pvals: typing.Sequence[float], + ) -> typing.Optional[typing.Sequence[float]]: + _, corrected_pvals, _, _ = multitest.multipletests( + pvals=pvals, + alpha=self._mtc_alpha, + method=self._mtc_correction, + is_sorted=False, + returnsorted=False, + ) + return corrected_pvals + + +class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysisResult[P]): + + def __init__( + self, + phenotypes: typing.Sequence[P], + n_usable: typing.Sequence[int], + all_counts: typing.Sequence[pd.DataFrame], + pvals: typing.Sequence[float], + corrected_pvals: typing.Optional[typing.Sequence[float]], + gt_predicate: GenotypePolyPredicate, + ): + self._phenotypes = tuple(phenotypes) + self._n_usable = tuple(n_usable) + self._all_counts = tuple(all_counts) + self._pvals = tuple(pvals) + self._corrected_pvals = ( + None if corrected_pvals is None else tuple(corrected_pvals) + ) + assert isinstance(gt_predicate, GenotypePolyPredicate) + self._gt_predicate = gt_predicate + + @property + def phenotypes(self) -> typing.Sequence[P]: + return self._phenotypes + + @property + def n_usable(self) -> typing.Sequence[int]: + return self._n_usable + + @property + def all_counts(self) -> typing.Sequence[pd.DataFrame]: + return self._all_counts + + @property + def pvals(self) -> typing.Sequence[float]: + return self._pvals + + @property + def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: + return self._corrected_pvals + + def __eq__(self, other): + return isinstance(other, BaseMultiPhenotypeAnalysisResult) \ + and self._phenotypes == other._phenotypes \ + and self._n_usable == other._n_usable \ + and self._all_counts == other._all_counts \ + and self._pvals == other._pvals \ + and self._corrected_pvals == other._corrected_pvals\ + and self._gt_predicate == other._gt_predicate + + def __hash__(self): + # NOTE: if a field is added, the hash method of the subclasses must be updated as well. + return hash(( + self._phenotypes, self._n_usable, self._all_counts, + self._pvals, self._corrected_pvals, self._gt_predicate, + )) + + +class DiseaseAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]): + + def compare_genotype_vs_phenotypes( + self, + cohort: typing.Iterable[Patient], + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], + ) -> MultiPhenotypeAnalysisResult[hpotk.TermId]: + pheno_predicates = tuple(pheno_predicates) + if len(pheno_predicates) == 0: + raise ValueError("No phenotype predicates were provided") + + # 1 - Count the patients + n_usable, all_counts = apply_predicates_on_patients( + patients=cohort, + gt_predicate=gt_predicate, + pheno_predicates=pheno_predicates, + ) + + # 2 - Compute nominal p values + pvals = self._compute_nominal_pvals(n_usable=n_usable, all_counts=all_counts) + + # 3 - Apply Multiple Testing Correction + if self._mtc_correction is None: + corrected_pvals = None + else: + corrected_pvals = self._apply_mtc(pvals=pvals) + + phenotypes = tuple(p.phenotype for p in pheno_predicates) + + return BaseMultiPhenotypeAnalysisResult( + phenotypes=phenotypes, + n_usable=n_usable, + all_counts=all_counts, + pvals=pvals, + corrected_pvals=corrected_pvals, + gt_predicate=gt_predicate, + ) + + +class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]): + """ + `HpoTermAnalysisResult` includes the :class:`HpoTermAnalysis` results. + + On top of the attributes of :class:`MultiPhenotypeAnalysisResult` parent, + the results include the outcome of :class:`PhenotypeMtcFilter`. + """ + + def __init__( + self, + phenotypes: typing.Sequence[hpotk.TermId], + n_usable: typing.Sequence[int], + all_counts: typing.Sequence[pd.DataFrame], + pvals: typing.Sequence[float], + corrected_pvals: typing.Optional[typing.Sequence[float]], + gt_predicate: GenotypePolyPredicate, + mtc_filter_name: str, + mtc_filter_results: typing.Sequence[PhenotypeMtcResult], + mtc_name: typing.Optional[str], + ): + super().__init__( + phenotypes=phenotypes, + n_usable=n_usable, + all_counts=all_counts, + pvals=pvals, + corrected_pvals=corrected_pvals, + gt_predicate=gt_predicate, + ) + self._mtc_filter_name = mtc_filter_name + self._mtc_filter_results = tuple(mtc_filter_results) + self._mtc_name = mtc_name + + @property + def mtc_filter_name(self) -> str: + """ + Get the MTC filter name. + """ + return self._mtc_filter_name + + @property + def mtc_filter_results(self) -> typing.Sequence[PhenotypeMtcResult]: + """ + Get a :class:`PhenotypeMtcResult` for each of the :attr:`phenotypes`. + """ + return self._mtc_filter_results + + @property + def mtc_name(self) -> typing.Optional[str]: + """ + Get the name of the multiple testing correction (MTC) procedure (e.g. `bonferroni`, `fdr_bh`, ...) + or `None` if no MTC was performed. + """ + return self._mtc_name + + def __eq__(self, other): + return isinstance(other, HpoTermAnalysisResult) \ + and BaseMultiPhenotypeAnalysisResult.__eq__(self, other) \ + and self._mtc_filter_name == other._mtc_filter_name \ + and self._mtc_filter_results == other._mtc_filter_results \ + and self._mtc_name == other._mtc_name + + def __hash__(self): + return hash(( + self._phenotypes, self._n_usable, self._all_counts, + self._pvals, self._corrected_pvals, self._gt_predicate, + self._mtc_filter_name, self._mtc_filter_results, self._mtc_name, + )) + + +class HpoTermAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]): + """ + `HpoTermAnalysis` can be applied if the individual phenotypes are represented as HPO terms. + + The analysis applies the genotype and phenotype predicates, computes the nominal p values, + and addresses the multiple testing burden by applying the :class:`PhenotypeMtcFilter` + followed by the multiple testing correction `mtc_correction` method. + + `PhenotypeMtcFilter` is applied even if no MTC should be applied. + """ + + def __init__( + self, + count_statistic: CountStatistic, + mtc_filter: PhenotypeMtcFilter, + mtc_correction: typing.Optional[str] = None, + mtc_alpha: float = 0.05, + ): + super().__init__( + count_statistic=count_statistic, + mtc_correction=mtc_correction, + mtc_alpha=mtc_alpha, + ) + assert isinstance(mtc_filter, PhenotypeMtcFilter) + self._mtc_filter = mtc_filter + + def compare_genotype_vs_phenotypes( + self, + cohort: typing.Iterable[Patient], + gt_predicate: GenotypePolyPredicate, + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], + ) -> HpoTermAnalysisResult: + pheno_predicates = tuple(pheno_predicates) + if len(pheno_predicates) == 0: + raise ValueError("No phenotype predicates were provided") + phenotypes = tuple(p.phenotype for p in pheno_predicates) + + # 1 - Count the patients + n_usable, all_counts = apply_predicates_on_patients( + cohort, + gt_predicate, + pheno_predicates, + ) + + # 2 - Apply MTC filter and select p values to MTC + mtc_filter_results = self._mtc_filter.filter( + gt_predicate=gt_predicate, + phenotypes=phenotypes, + counts=all_counts, + ) + mtc_mask = np.array([r.is_passed() for r in mtc_filter_results]) + + # 3 - Compute nominal p values + pvals = np.full(shape=(len(n_usable),), fill_value=np.nan) + pvals[mtc_mask] = self._compute_nominal_pvals( + n_usable=slice_list_in_numpy_style(n_usable, mtc_mask), + all_counts=slice_list_in_numpy_style(all_counts, mtc_mask), + ) + + # 4 - Apply Multiple Testing Correction + if self._mtc_correction is None: + corrected_pvals = None + else: + corrected_pvals = np.full(shape=pvals.shape, fill_value=np.nan) + # Do not test the p values that have been filtered out. + corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask]) + + return HpoTermAnalysisResult( + phenotypes=phenotypes, + n_usable=n_usable, + all_counts=all_counts, + pvals=pvals, + corrected_pvals=corrected_pvals, + gt_predicate=gt_predicate, + mtc_filter_name=self._mtc_filter.filter_method_name(), + mtc_filter_results=mtc_filter_results, + mtc_name=self._mtc_correction, + ) + + +WHATEVER = typing.TypeVar('WHATEVER') + + +def slice_list_in_numpy_style( + a: typing.Sequence[WHATEVER], + mask: typing.Sequence[bool], +) -> typing.Sequence[WHATEVER]: + assert len(a) == len(mask) + return [item for item, mask_is_set in zip(a, mask) if mask_is_set] diff --git a/src/gpsea/analysis/pcats/stats/__init__.py b/src/gpsea/analysis/pcats/stats/__init__.py new file mode 100644 index 00000000..4d7e0543 --- /dev/null +++ b/src/gpsea/analysis/pcats/stats/__init__.py @@ -0,0 +1,5 @@ +from ._stats import CountStatistic, ScipyFisherExact, PythonMultiFisherExact + +__all__ = [ + 'CountStatistic', 'ScipyFisherExact', 'PythonMultiFisherExact', +] diff --git a/src/gpsea/analysis/pcats/stats/_stats.py b/src/gpsea/analysis/pcats/stats/_stats.py new file mode 100644 index 00000000..8df07fdc --- /dev/null +++ b/src/gpsea/analysis/pcats/stats/_stats.py @@ -0,0 +1,177 @@ +import abc +import math +from decimal import Decimal + + +import numpy as np +import pandas as pd + +import scipy + + +class CountStatistic(metaclass=abc.ABCMeta): + """ + `CountStatistic` calculates a p value for a contingency table + produced by a pair of discrete random variables. + + The `counts` table is usually `2x2` or `2x3`. + """ + + @abc.abstractmethod + def compute_pval( + self, + counts: pd.DataFrame, + ) -> float: + pass + + +class ScipyFisherExact(CountStatistic): + """ + `ScipyFisherExact` performs Fisher Exact Test on a `2x2` contingency table. + + The class is a thin wrapper around Scipy :func:`~scipy.stats.fisher_exact` function. + The two-sided :math:`H_1` is considered. + """ + + def compute_pval( + self, + counts: pd.DataFrame, + ) -> float: + assert counts.shape == ( + 2, + 2, + ), f"Cannot run Fisher exact test on an array with {counts.shape} shape" + _, pval = scipy.stats.fisher_exact(counts.values, alternative="two-sided") + return pval + + +class PythonMultiFisherExact(CountStatistic): + """ + `PythonMultiFisherExact` is a Python implementation of Fisher Exact Test to compute + p value for a `2x3` contingency table. + """ + + def compute_pval( + self, + counts: pd.DataFrame, + ) -> float: + PythonMultiFisherExact._check_input(counts) + return self._fisher_exact(counts.values) + + @staticmethod + def _check_input(a: pd.DataFrame): + if not isinstance(a, pd.DataFrame): + raise ValueError(f"Expected a pandas DataFrame but got {type(a)}") + if not a.shape == (2, 3): + raise ValueError(f"Shape of the array must be (2, 3) but got {a.shape}") + if np.array_equal(a.values, np.zeros_like(a)): + raise ValueError("Data frame is all zeros, cannot run analysis") + + def _fisher_exact( + self, + table: np.ndarray, + ): + row_sum = [] + col_sum = [] + + for i in range(len(table)): + temp = 0 + for j in range(len(table[0])): + temp += table[i][j] + row_sum.append(temp) + + for j in range(len(table[0])): + temp = 0 + for i in range(len(table)): + temp += table[i][j] + col_sum.append(temp) + + mat = [[0] * len(col_sum)] * len(row_sum) + pos = (0, 0) + + p_0 = 1 + + for x in row_sum: + p_0 *= math.factorial(x) + for y in col_sum: + p_0 *= math.factorial(y) + + n = 0 + for x in row_sum: + n += x + p_0 /= Decimal(math.factorial(n)) + + for i in range(len(table)): + for j in range(len(table[0])): + p_0 /= Decimal(math.factorial(table[i][j])) + + p = [0] + self._dfs(mat, pos, row_sum, col_sum, p_0, p) + + return float(p[0]) + + def _dfs(self, mat, pos, r_sum, c_sum, p_0, p): + + (xx, yy) = pos + (r, c) = (len(r_sum), len(c_sum)) + + mat_new = [] + + for i in range(len(mat)): + temp = [] + for j in range(len(mat[0])): + temp.append(mat[i][j]) + mat_new.append(temp) + + if xx == -1 and yy == -1: + for i in range(r - 1): + temp = r_sum[i] + for j in range(c - 1): + temp -= mat_new[i][j] + mat_new[i][c - 1] = temp + for j in range(c - 1): + temp = c_sum[j] + for i in range(r - 1): + temp -= mat_new[i][j] + mat_new[r - 1][j] = temp + temp = r_sum[r - 1] + for j in range(c - 1): + temp -= mat_new[r - 1][j] + if temp < 0: + return + mat_new[r - 1][c - 1] = temp + + p_1 = 1 + for x in r_sum: + p_1 *= math.factorial(x) + for y in c_sum: + p_1 *= math.factorial(y) + + n = 0 + for x in r_sum: + n += x + p_1 /= Decimal(math.factorial(n)) + + for i in range(len(mat_new)): + for j in range(len(mat_new[0])): + p_1 /= Decimal(math.factorial(mat_new[i][j])) + if p_1 <= p_0 + Decimal(0.00000001): + # print(mat_new) + # print(p_1) + p[0] += p_1 + else: + max_1 = r_sum[xx] + max_2 = c_sum[yy] + for j in range(c): + max_1 -= mat_new[xx][j] + for i in range(r): + max_2 -= mat_new[i][yy] + for k in range(min(max_1, max_2) + 1): + mat_new[xx][yy] = k + if xx == r - 2 and yy == c - 2: + pos_new = (-1, -1) + elif xx == r - 2: + pos_new = (0, yy + 1) + else: + pos_new = (xx + 1, yy) + self._dfs(mat_new, pos_new, r_sum, c_sum, p_0, p) diff --git a/src/gpsea/analysis/pcats/stats/_test__stats.py b/src/gpsea/analysis/pcats/stats/_test__stats.py new file mode 100644 index 00000000..aa254306 --- /dev/null +++ b/src/gpsea/analysis/pcats/stats/_test__stats.py @@ -0,0 +1,33 @@ +import numpy as np +import pandas as pd +import pytest + +from ._stats import PythonMultiFisherExact + + +class TestPythonMultiFisherExact: + + @pytest.fixture + def fisher_exact(self) -> PythonMultiFisherExact: + return PythonMultiFisherExact() + + @pytest.mark.parametrize( + "counts, expected", + ( + [[[2, 1, 0], [3, 0, 2]], 0.6428571428571429], + [[[5, 5, 5], [5, 5, 5]], 1], + [[[10, 2, 3], [1, 3, 4]], 0.03952977071835599], + [[[6, 0, 0], [6, 5, 8]], 0.02334274421230943], + ), + ) + def test_compute_pval( + self, + counts, + expected, + fisher_exact: PythonMultiFisherExact, + ): + contingency_matrix = pd.DataFrame(np.array(counts, dtype=np.int64)) + + final_pval = fisher_exact.compute_pval(contingency_matrix) + assert final_pval == pytest.approx(expected) + diff --git a/src/gpsea/analysis/predicate/__init__.py b/src/gpsea/analysis/predicate/__init__.py index a2e0cb54..b4f3b28b 100644 --- a/src/gpsea/analysis/predicate/__init__.py +++ b/src/gpsea/analysis/predicate/__init__.py @@ -1,12 +1,7 @@ -from . import genotype -from . import phenotype - from ._api import PatientCategory, PatientCategories, Categorization, C from ._api import PolyPredicate -from ._api import GenotypePolyPredicate, GenotypeBooleanPredicate __all__ = [ 'PatientCategory', 'PatientCategories', 'Categorization', 'C', 'PolyPredicate', - 'GenotypePolyPredicate', 'GenotypeBooleanPredicate', ] diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py index 2f1a62ea..8c62e191 100644 --- a/src/gpsea/analysis/predicate/_api.py +++ b/src/gpsea/analysis/predicate/_api.py @@ -148,11 +148,38 @@ def get_categorizations(self) -> typing.Sequence[C]: """ pass - def get_categories(self) -> typing.Sequence[PatientCategory]: + def get_categories(self) -> typing.Iterator[PatientCategory]: """ - Get a sequence with :class:`PatientCategory` instances that the predicate can produce. + Get an iterator with :class:`PatientCategory` instances that the predicate can produce. """ - return tuple(c.category for c in self.get_categorizations()) + return (c.category for c in self.get_categorizations()) + + def get_category( + self, + cat_id: int, + ) -> PatientCategory: + """ + Get the category name for a :class:`PatientCategory.cat_id`. + + :param cat_id: an `int` with the id. + :raises: ValueError if there is no such category was defined. + """ + for ctg in self.get_categories(): + if ctg.cat_id == cat_id: + return ctg.name + raise ValueError(f'No category for {cat_id} was found') + + def get_category_name( + self, + cat_id: int, + ) -> str: + """ + Get the category name for a :class:`PatientCategory.cat_id`. + + :param cat_id: an `int` with the id. + :raises: ValueError if there is no such category was defined. + """ + return self.get_category(cat_id).name @abc.abstractmethod def get_question(self) -> str: @@ -183,35 +210,3 @@ def _check_patient(patient: Patient): """ if not isinstance(patient, Patient): raise ValueError(f"patient must be type Patient but was type {type(patient)}") - - -class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta): - """ - `GenotypePolyPredicate` constrains `PolyPredicate` to investigate the genotype aspects - of patients. - """ - pass - - -class GenotypeBooleanPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): - """ - `GenotypeBooleanPredicate` tests if a :class:`~gpsea.model.Patient` belongs to a genotype group - and returns a boolean binning. - """ - YES = Categorization(PatientCategories.YES) - NO = Categorization(PatientCategories.NO) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - """ - The predicate bins a patient into :class:`BooleanPredicate.NO` or :class:`BooleanPredicate.YES` category. - """ - return GenotypeBooleanPredicate.YES, GenotypeBooleanPredicate.NO - - -class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): - BOTH = Categorization(PatientCategory(0, 'Both', 'The patient belongs in both groups.')) - ONE = Categorization(PatientCategory(1, 'One', 'The patient belongs in one of the two groups.')) - NEITHER = Categorization(PatientCategory(2, 'Neither', 'The patient does not belong in either group.')) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return RecessiveGroupingPredicate.BOTH, RecessiveGroupingPredicate.ONE, RecessiveGroupingPredicate.NEITHER diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index feff1703..a8a2ba91 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,9 +1,11 @@ +from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import boolean_predicate, groups_predicate, recessive_predicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ + 'GenotypePolyPredicate', 'boolean_predicate', 'groups_predicate', 'recessive_predicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py index 5e1a201f..39cf020b 100644 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ b/src/gpsea/analysis/predicate/genotype/_api.py @@ -2,13 +2,31 @@ import typing from gpsea.model import Variant +from .._api import PolyPredicate, Categorization, PatientCategory + + +class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta): + """ + `GenotypePolyPredicate` is a base class for all :class:`PolyPredicate` + that test the genotype axis. + """ + pass + + +class RecessiveGroupingPredicate(GenotypePolyPredicate, metaclass=abc.ABCMeta): + BOTH = Categorization(PatientCategory(0, 'Both', 'The patient belongs in both groups.')) + ONE = Categorization(PatientCategory(1, 'One', 'The patient belongs in one of the two groups.')) + NEITHER = Categorization(PatientCategory(2, 'Neither', 'The patient does not belong in either group.')) + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return RecessiveGroupingPredicate.BOTH, RecessiveGroupingPredicate.ONE, RecessiveGroupingPredicate.NEITHER class VariantPredicate(metaclass=abc.ABCMeta): """ `VariantPredicate` tests if a variant meets a certain criterion. - The subclasses are expected to implement all abstract methods of this class + The subclasses are expected to implement all abstract methods of this class *plus* ``__eq__`` and ``__hash__``, to support building of compound predicates. We *strongly* recommend implementing ``__str__`` and ``__repr__`` as well. @@ -43,7 +61,7 @@ def __and__(self, other): return self if isinstance(self, AllVariantPredicate) and isinstance(other, AllVariantPredicate): - # Merging two *all* variant predicates is equivalent + # Merging two *all* variant predicates is equivalent # to chaining their predicates return AllVariantPredicate(*self.predicates, *other.predicates) else: @@ -60,7 +78,7 @@ def __or__(self, other): return self if isinstance(self, AnyVariantPredicate) and isinstance(other, AnyVariantPredicate): - # Merging two any variant predicates is equivalent + # Merging two any variant predicates is equivalent # to chaining their predicates return AnyVariantPredicate(*self.predicates, *other.predicates) else: @@ -71,7 +89,7 @@ def __or__(self, other): def __invert__(self): """ Create a variant predicate that passes if the underlying predicate fails. - """ + """ if isinstance(self, InvVariantPredicate): # Unwrap to prevent double negation return self._inner @@ -89,7 +107,6 @@ def __init__( if len(predicates) == 0: raise ValueError('Predicates must not be empty!') self._predicates = tuple(predicates) - @property def predicates(self) -> typing.Sequence[VariantPredicate]: @@ -169,4 +186,3 @@ def __str__(self) -> str: def __repr__(self) -> str: return f'NotVariantPredicate(inner={self._inner})' - \ No newline at end of file diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index be0251d2..3f54a7e0 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -2,17 +2,19 @@ from gpsea.model import Patient -from .._api import Categorization -from .._api import GenotypePolyPredicate, GenotypeBooleanPredicate, RecessiveGroupingPredicate +from .._api import Categorization, PatientCategories +from ._api import GenotypePolyPredicate, RecessiveGroupingPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -class AlleleCountingGenotypeBooleanPredicate(GenotypeBooleanPredicate): +class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API """ The predicate tests presence of at least one matching allele in the patient. """ + YES = Categorization(PatientCategories.YES) + NO = Categorization(PatientCategories.NO) @staticmethod def for_variant_predicate(predicate: VariantPredicate): @@ -22,6 +24,14 @@ def for_variant_predicate(predicate: VariantPredicate): def __init__(self, allele_counter: AlleleCounter): self._allele_counter = allele_counter + def get_categorizations(self) -> typing.Sequence[Categorization]: + """ + The predicate bins a patient into + :attr:`AlleleCountingGenotypeBooleanPredicate.NO` + or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category. + """ + return AlleleCountingGenotypeBooleanPredicate.YES, AlleleCountingGenotypeBooleanPredicate.NO + def get_question(self) -> str: return self._allele_counter.get_question() @@ -30,16 +40,17 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: allele_count = self._allele_counter.count(patient) if allele_count > 0: - return GenotypeBooleanPredicate.YES + return AlleleCountingGenotypeBooleanPredicate.YES elif allele_count == 0: - return GenotypeBooleanPredicate.NO + return AlleleCountingGenotypeBooleanPredicate.NO else: raise ValueError( f"Allele counter should return a non-negative allele count: {allele_count}" ) def __eq__(self, value: object) -> bool: - return isinstance(value, AlleleCountingGenotypeBooleanPredicate) and self._allele_counter == value._allele_counter + return isinstance(value, AlleleCountingGenotypeBooleanPredicate) \ + and self._allele_counter == value._allele_counter def __hash__(self) -> int: return hash((self._allele_counter,)) @@ -52,7 +63,7 @@ def __repr__(self) -> str: # TODO: write AD, AR, XLR, XLD -def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypeBooleanPredicate: +def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: """ Create a genotype boolean predicate from given `variant_predicate` to test for presence of at least one matching allele in the patient. diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index 517f5970..aeaf6e1d 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -30,12 +30,12 @@ def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: Build a predicate to test if variant has a functional annotation to genes *SURF1* and *SURF2*: >>> from gpsea.analysis.predicate.genotype import VariantPredicates - + >>> genes = ('SURF1', 'SURF2',) >>> predicate = VariantPredicates.all(VariantPredicates.gene(g) for g in genes) >>> predicate.get_question() '(impacts SURF1 AND impacts SURF2)' - + Args: predicates: an iterable of predicates to test """ @@ -55,11 +55,12 @@ def any(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: **Example** - Build a predicate to test if variant leads to a missense or non-sense change on a fictive transcript `NM_123456.7`: + Build a predicate to test if variant leads to a missense + or nonsense change on a fictional transcript `NM_123456.7`: >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates - + >>> tx_id = 'NM_123456.7' >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,) >>> predicate = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in effects) @@ -140,7 +141,8 @@ def exon( Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. Args: - exon: a non-negative `int` with the index of the target exon (e.g. `0` for the 1st exon, `1` for the 2nd, ...) + exon: a non-negative `int` with the index of the target exon + (e.g. `0` for the 1st exon, `1` for the 2nd, ...) tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) """ return VariantExonPredicate(exon, tx_id) @@ -148,10 +150,12 @@ def exon( @staticmethod def region(region: Region, tx_id: str) -> VariantPredicate: """ - Prepare a :class:`VariantPredicate` that tests if the variant overlaps with a region on a protein of a specific transcript. + Prepare a :class:`VariantPredicate` that tests if the variant + overlaps with a region on a protein of a specific transcript. Args: - region: a :class:`Region` that gives the start and end coordinate of the region of interest on a protein strand. + region: a :class:`Region` that gives the start and end coordinate + of the region of interest on a protein strand. """ return ProteinRegionPredicate(region, tx_id) @@ -173,15 +177,20 @@ def is_structural_variant( SVs are usually defined as variant affecting more than a certain number of base pairs. The thresholds vary in the literature, but here we use 50bp as a default. - Any variant that affects at least `threshold` base pairs is considered an SV. - Large SVs with unknown breakpoint coordinates or translocations (:class:`VariantClass.BND`) + Any variant that affects at least `threshold` base pairs is considered an SV. + Large SVs with unknown breakpoint coordinates or translocations (:class:`VariantClass.BND`) are always considered as an SV. Args: threshold: a non-negative `int` with the number of base pairs that must be affected """ - assert threshold >= 0, '`threshold` must be non-negative!' - return VariantPredicates.change_length('<=', -threshold) | VariantPredicates.change_length('>=', threshold) | VariantPredicates.is_large_imprecise_sv() | IS_BND + assert threshold >= 0, "`threshold` must be non-negative!" + return ( + VariantPredicates.change_length("<=", -threshold) + | VariantPredicates.change_length(">=", threshold) + | VariantPredicates.is_large_imprecise_sv() + | IS_BND + ) @staticmethod def structural_type( @@ -190,7 +199,8 @@ def structural_type( """ Prepare a :class:`VariantPredicate` for testing if the variant has a certain structural type. - We recommend using a descendant of `structural_variant` (`SO:0001537 `_) + We recommend using a descendant of `structural_variant` + (`SO:0001537 `_) as the structural type. **Example** @@ -237,51 +247,12 @@ def ref_length( length: int, ) -> VariantPredicate: """ - Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele + Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele of variant is above, below, or (not) equal to certain `length`. - The length of the REF allele corresponds to the length of the genomic region affected by the variant. - Let's show a few examples. - - >>> from gpsea.model import VariantCoordinates - >>> from gpsea.model.genome import GRCh38 - >>> chr1 = GRCh38.contig_by_name("chr1") - - The length of the reference allele of a missense variant is 1 - because the variant affects a 1-bp region spanned by the ``C`` nucleotide: - - >>> missense = VariantCoordinates.from_vcf_literal(chr1, 1001, 'C', 'T') - >>> len(missense) - 1 - - The length of a "small" deletion is the same as the length of the ref allele `str`: - (``'CCC'`` in the example below): - - >>> deletion = VariantCoordinates.from_vcf_literal(chr1, 1001, 'CCC', 'C') - >>> len(deletion) - 3 - - This is because the literal notation spells out the alleles. - However, this simple rule does not apply in symbolic notation. - Here, the REF length corresponds to the length of the allele region. - - For instance, for the following structural variant - - .. code:: - - #CHROM POS ID REF ALT QUAL FILTER INFO - 1 1001 . C 6 PASS SVTYPE=DEL;END=1003;SVLEN=-3 - - the length of the REF allele is `3`: - - >>> sv_deletion = VariantCoordinates.from_vcf_symbolic( - ... chr1, pos=1001, end=1003, - ... ref='C', alt='', svlen=-3, - ... ) - >>> len(sv_deletion) - 3 + .. seealso:: - because the deletion removes 3 base pairs at the coordinates :math:`[1001, 1003]`. + See :ref:`length-of-the-reference-allele` for more info. **Example** @@ -291,13 +262,12 @@ def ref_length( >>> predicate = VariantPredicates.ref_length('>', 5) >>> predicate.get_question() 'ref allele length > 5' - + Args: operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. length: a non-negative `int` with the length threshold. """ return RefAlleleLengthPredicate(operator, length) - @staticmethod def change_length( @@ -310,11 +280,11 @@ def change_length( .. seealso:: - See :meth:`gpsea.model.VariantCoordinates.change_length` for more info on change length. + See :ref:`change-length-of-an-allele` for more info. **Example** - Make a predicate for testing if the change length is less than or equal to `-10`, + Make a predicate for testing if the change length is less than or equal to `-10`, e.g. to test if a variant is a *deletion* leading to removal of at least 10 base pairs: >>> from gpsea.analysis.predicate.genotype import VariantPredicates @@ -333,15 +303,15 @@ def is_structural_deletion( threshold: int = -50, ) -> VariantPredicate: """ - Prepare a :class:`VariantPredicate` for testing if the variant + Prepare a :class:`VariantPredicate` for testing if the variant is a `chromosomal deletion `_ or a structural variant deletion that leads to removal of at least *n* base pairs (50bp by default). - + .. note:: The predicate uses :meth:`~gpsea.model.VariantCoordinates.change_length` to determine if the length of the variant is above or below `threshold`. - + **IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele is shorter than the reference allele. See the method's documentation for more info. @@ -378,9 +348,7 @@ def __init__( self._protein_metadata_service = protein_metadata_service def protein_feature_type( - self, - feature_type: FeatureType, - tx_id: str + self, feature_type: FeatureType, tx_id: str ) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant affects a protein feature type. @@ -392,11 +360,7 @@ def protein_feature_type( feature_type, tx_id, self._protein_metadata_service ) - def protein_feature( - self, - feature_id: str, - tx_id: str - ) -> VariantPredicate: + def protein_feature(self, feature_id: str, tx_id: str) -> VariantPredicate: """ Prepare a :class:`VariantPredicate` that tests if the variant affects a protein feature type. diff --git a/src/gpsea/analysis/predicate/phenotype/__init__.py b/src/gpsea/analysis/predicate/phenotype/__init__.py index f87a9f92..2efa1e55 100644 --- a/src/gpsea/analysis/predicate/phenotype/__init__.py +++ b/src/gpsea/analysis/predicate/phenotype/__init__.py @@ -1,9 +1,19 @@ -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate, CountingPhenotypeScorer +""" +The `gpsea.analysis.predicate.phenotype` package provides the :class:`PhenotypePolyPredicate` +for assigning :class:`~gpsea.model.Patient` into a phenotype group. + +An individual can be assigned based on presence/absence of a disease diagnosis (:class:`DiseasePresencePredicate`) +or using the phenotype features encoded into HPO terms (:class:`PropagatingPhenotypePredicate`). +""" + +from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate from ._pheno import DiseasePresencePredicate from ._pheno import PhenotypeCategorization, P +from ._util import prepare_predicates_for_terms_of_interest, prepare_hpo_terms_of_interest __all__ = [ - 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate', 'CountingPhenotypeScorer', + 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate', 'DiseasePresencePredicate', 'PhenotypeCategorization', 'P', + 'prepare_predicates_for_terms_of_interest', 'prepare_hpo_terms_of_interest', ] diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 4b13f766..c2646f72 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -69,9 +69,9 @@ class PhenotypePolyPredicate( @property @abc.abstractmethod - def phenotypes(self) -> typing.Sequence[P]: + def phenotype(self) -> P: """ - Get the phenotype entities of interest. + Get the phenotype entity of interest. """ pass @@ -97,7 +97,7 @@ def __init__( self._query = hpotk.util.validate_instance( query, hpotk.TermId, "phenotypic_feature" ) - self._query_label = self._hpo.get_term(query) + self._query_label = self._hpo.get_term_name(query) assert self._query_label is not None, f"Query {query} is in HPO" self._missing_implies_phenotype_excluded = hpotk.util.validate_instance( missing_implies_phenotype_excluded, @@ -117,9 +117,8 @@ def get_question(self) -> str: return f"Is {self._query_label} present in the patient?" @property - def phenotypes(self) -> typing.Sequence[hpotk.TermId]: - # We usually test just a single HPO term, so we return a tuple with a single member. - return (self._query,) + def phenotype(self) -> hpotk.TermId: + return self._query def get_categorizations( self, @@ -205,9 +204,8 @@ def get_question(self) -> str: return f"Was {self._query} diagnosed in the patient" @property - def phenotypes(self) -> typing.Sequence[hpotk.TermId]: - # We usually test just a single Disease, so we return a tuple with a single member. - return (self._query,) + def phenotype(self) -> hpotk.TermId: + return self._query def get_categorizations( self, @@ -240,131 +238,3 @@ def test( def __repr__(self): return f"DiseasePresencePredicate(query={self._query})" - - -class CountingPhenotypeScorer: - """ - `CountingPhenotypeScorer` assigns the patient with a phenotype score - that is equivalent to the count of present phenotypes that are either - an exact match to the `query` terms or their descendants. - For instance, we may want to count whether an individual has brain, liver, kidney, and skin abormalities. - In the case, the query would include the corresponding terms (e.g., Abnormal brain morphology HP:0012443). - An individual can then have between 0 and 4 phenotype group abnormalities. - This predicate is intended to be used with the Mann Whitney U test. - - Example - ^^^^^^^ - We first need to load HPO using HPO toolkit: - - >>> import hpotk - >>> store = hpotk.configure_ontology_store() - >>> hpo = store.load_minimal_hpo(release='v2023-10-09') - - Now we can create `CountingPhenotypeScorer` to test for presence of brain, liver, kidney and skin abnormalities: - - >>> from gpsea.analysis.predicate.phenotype import CountingPhenotypeScorer - >>> phenotype_groups = ( - ... "HP:0012443", # Abnormal brain morphology - ... "HP:0410042", # Abnormal liver morphology - ... "HP:0012210", # Abnormal renal morphology - ... "HP:0011121", # Abnormal skin morphology - ... ) - >>> scorer = CountingPhenotypeScorer.from_query_curies( - ... hpo=hpo, - ... query=phenotype_groups, - ... ) - - """ - - @staticmethod - def from_query_curies( - hpo: hpotk.MinimalOntology, - query: typing.Iterable[typing.Union[str, hpotk.TermId]], - ): - """ - Create a scorer to test for the number of phenotype terms that fall into the phenotype groups. - - :param hpo: HPO as represented by :class:`~hpotk.ontology.MinimalOntology` of HPO toolkit. - :param query: an iterable of the top-level terms, either represented as CURIEs (`str`) - or as term IDs. - """ - query_term_ids = set() - for q in query: - # First check if the query items are Term IDs or curies. - if isinstance(q, str): - q = hpotk.TermId.from_curie(q) - elif isinstance(q, hpotk.TermId): - pass - else: - raise ValueError( - f"query argument must be iterable of hpotk TermId's or strings but we found {type(q)}" - ) - - # Now chack that the term IDs are HPO term IDs. - if q not in hpo: - raise ValueError(f"The query {q} was not found in the HPO") - query_term_ids.add(q) - - if len(query_term_ids) == 0: - raise ValueError("`query` must not be empty") - - # the query terms must not include a term and its ancestor - for q in query_term_ids: - for anc in hpo.graph.get_ancestors(q): - if anc in query_term_ids: - raise ValueError( - f"Both {q} and its ancestor term {anc} were found in the query, " - + "but query terms must not include a term and its ancestor" - ) - - return CountingPhenotypeScorer( - hpo=hpo, - query=query_term_ids, - ) - - def __init__( - self, - hpo: hpotk.MinimalOntology, - query: typing.Iterable[hpotk.TermId], - ): - self._hpo = hpo - self._query = set(query) - - def get_question(self) -> str: - return "How many of the query HPO terms (or their descendants) does the individual display" - - def score( - self, - patient: Patient, - ) -> float: - """ - Get the count (number) of terms in the query set - that have matching terms (exact matches or descendants) in the patient. - Do not double count if the patient has two terms - (e.g., two different descendants) of one of the query terms. - """ - count = 0 - for q in self._query: - for pf in patient.present_phenotypes(): - hpo_id = pf.identifier - if hpo_id == q or any( - anc == q for anc in self._hpo.graph.get_ancestors(hpo_id) - ): - count += 1 - # We break the inner loop to continue the outer. - break - - # A sanity check - we cannot produce more counts than there are categories! - assert 0 <= count <= len(self._query) - - return count - - def __call__( - self, - *args: typing.Any, - **kwds: typing.Any, - ) -> float: - # TODO: move to `PhenotypeScorer` API. - assert len(args) == 1 and isinstance(args[0], Patient), 'The first argument must be an instance of `Patient`' - assert len(kwds) == 0, 'We do not take any key-word arguments' - return self.score(args[0]) diff --git a/src/gpsea/analysis/predicate/phenotype/_util.py b/src/gpsea/analysis/predicate/phenotype/_util.py new file mode 100644 index 00000000..ac73dd02 --- /dev/null +++ b/src/gpsea/analysis/predicate/phenotype/_util.py @@ -0,0 +1,88 @@ +import typing + +from collections import Counter + +import hpotk + +from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate + +from gpsea.model import Patient + + +def prepare_predicates_for_terms_of_interest( + cohort: typing.Iterable[Patient], + hpo: hpotk.graph.GraphAware, + missing_implies_excluded: bool = False, + min_n_of_patients_with_term: int = 2, +) -> typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]: + """ + A convenience method for creating a battery of :class:`PropagatingPhenotypePredicate` predicates + for testing all phenotypes of interest. + + :param cohort: a cohort of individuals to investigate. + :param hpo: an entity with an HPO graph (e.g. :class:`~hpotk.MinimalOntology`). + :param missing_implies_excluded: `True` if absence of an annotation should be counted as its explicit exclusion. + :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term + (either directly or indirectly) for the term to be included in the analysis. + """ + return tuple( + PropagatingPhenotypePredicate( + hpo=hpo, + query=term, + missing_implies_phenotype_excluded=missing_implies_excluded, + ) for term in prepare_hpo_terms_of_interest( + cohort, hpo, min_n_of_patients_with_term, + ) + ) + + +def prepare_hpo_terms_of_interest( + cohort: typing.Iterable[Patient], + hpo: hpotk.graph.GraphAware, + min_n_of_patients_with_term: int = 2, +) -> typing.Sequence[hpotk.TermId]: + """ + Prepare a collection of HPO terms to test. + + This includes the direct HPO patient annotations + as well as the ancestors of the present terms and the descendants of the excluded terms. + + :param cohort: a cohort of individuals to investigate. + :param hpo: an entity with an HPO graph (e.g. :class:`~hpotk.MinimalOntology`). + :param min_n_of_patients_with_term: the minimum number of patients that must feature an HPO term + (either directly or indirectly) for the term to be included in the analysis. + """ + present_count = Counter() + excluded_count = Counter() + + for patient in cohort: + for pf in patient.phenotypes: + if pf.is_present: + # A present phenotypic feature must be counted in. + present_count[pf.identifier] += 1 + # and it also implies presence of its ancestors. + for anc in hpo.graph.get_ancestors(pf): + present_count[anc] += 1 + else: + # An excluded phenotypic feature + excluded_count[pf.identifier] += 1 + for desc in hpo.graph.get_descendants(pf): + # implies exclusion of its descendants. + excluded_count[desc] += 1 + + total_count = Counter() + for term_id, count in present_count.items(): + total_count[term_id] += count + + for term_id, count in excluded_count.items(): + total_count[term_id] += count + + final_hpo = [] + for term_id in present_count: + # Keep the term if it is mentioned at least *n* times (incl. being excluded) + # in the cohort + n_all = total_count[term_id] + if n_all >= min_n_of_patients_with_term: + final_hpo.append(term_id) + + return tuple(final_hpo) diff --git a/src/gpsea/analysis/pscore/__init__.py b/src/gpsea/analysis/pscore/__init__.py new file mode 100644 index 00000000..d03f6fd9 --- /dev/null +++ b/src/gpsea/analysis/pscore/__init__.py @@ -0,0 +1,7 @@ +from ._api import PhenotypeScorer, PhenotypeScoreAnalysis, PhenotypeScoreAnalysisResult +from ._impl import CountingPhenotypeScorer + +__all__ = [ + 'PhenotypeScorer', 'PhenotypeScoreAnalysis', 'PhenotypeScoreAnalysisResult', + 'CountingPhenotypeScorer', +] diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py new file mode 100644 index 00000000..a2899734 --- /dev/null +++ b/src/gpsea/analysis/pscore/_api.py @@ -0,0 +1,162 @@ +import abc +import typing + +import pandas as pd + +from gpsea.model import Patient +from ..predicate.genotype import GenotypePolyPredicate +from .stats import PhenotypeScoreStatistic + + +class PhenotypeScorer(metaclass=abc.ABCMeta): + """ + `PhenotypeScorer` assigns the patient with a phenotype score. + """ + + def score(self, patient: Patient) -> float: + """ + Compute the score for the `patient`. + """ + pass + + +class PhenotypeScoreAnalysisResult: + """ + `PhenotypeScoreAnalysisResult` is a container for :class:`PhenotypeScoreAnalysis` results. + """ + + def __init__( + self, + genotype_phenotype_scores: pd.DataFrame, + pval: float, + ): + self._genotype_phenotype_scores = genotype_phenotype_scores + self._pval = float(pval) + + @property + def genotype_phenotype_scores(self) -> pd.DataFrame: + """ + Get the DataFrame with the genotype group and the phenotype score for each patient. + + The DataFrame has the following structure: + + ========== ======== ========= + patient_id genotype phenotype + ========== ======== ========= + patient_1 0 1 + patient_2 0 3 + patient_3 None 2 + patient_4 1 2 + ... ... ... + ========== ======== ========= + + The DataFrame index includes the patient IDs, and then there are 2 columns + with the `genotype` group id (:attr:`~gpsea.analysis.predicate.PatientCategory.cat_id`) + and the `phenotype` score. A `genotype` value may be missing if the patient + cannot be assigned into any genotype category. + """ + return self._genotype_phenotype_scores + + @property + def pval(self) -> float: + """ + Get the p value of the test. + """ + return self._pval + + def plot_boxplots( + self, + gt_predicate: GenotypePolyPredicate, + ax, + colors=["darksalmon", "honeydew"], + ): + """ + Draw box plots with distributions of phenotype scores for genotype groups + """ + # skip the patients with unassigned genotype group + not_na_gts = self._genotype_phenotype_scores["genotype"].notna() + data = self._genotype_phenotype_scores.loc[not_na_gts] + actual = set(data["genotype"].unique()) + expected = gt_predicate.get_categorizations() + x = [ + data.loc[data["genotype"] == c.category.cat_id, "phenotype"].to_list() + for c in gt_predicate.get_categorizations() + ] + + gt_cat_names = [ + c.category.name for c in gt_predicate.get_categorizations() + ] + bplot = ax.boxplot( + x=x, + patch_artist=True, + tick_labels=gt_cat_names, + ) + + for patch, color in zip(bplot["boxes"], colors): + patch.set_facecolor(color) + + +class PhenotypeScoreAnalysis: + """ + `PhenotypeScoreAnalysis` tests the association between two or more genotype groups + and a phenotype score. + + The genotype groups are created by a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` + and the phenotype score is computed with :class:`~gpsea.analysis.pscore.PhenotypeScorer`. + + The association is tested with a :class:`~gpsea.analysis.pscore.PhenotypeScoreStatistic` + and the results are reported as a :class:`PhenotypeScoreAnalysisResult`. + """ + + def __init__( + self, + score_statistic: PhenotypeScoreStatistic, + ): + assert isinstance(score_statistic, PhenotypeScoreStatistic) + self._statistic = score_statistic + + def compare_genotype_vs_phenotype_score( + self, + cohort: typing.Iterable[Patient], + gt_predicate: GenotypePolyPredicate, + pheno_scorer: PhenotypeScorer, + ) -> PhenotypeScoreAnalysisResult: + """ + Compute the association between genotype groups and phenotype score. + + :param cohort: the cohort to analyze. + :param gt_predicate: a predicate for assigning an individual into a genotype group. + :param pheno_scorer: the scorer to compute phenotype score. + """ + assert ( + gt_predicate.n_categorizations() == 2 + ), "We only support 2 genotype categories at this point" + + idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id") + data = pd.DataFrame( + None, + index=idx, + columns=["genotype", "phenotype"], + ) + + # Apply the predicates on the patients + for patient in cohort: + gt_cat = gt_predicate.test(patient) + if gt_cat is None: + data.loc[patient.patient_id, "genotype"] = None + else: + data.loc[patient.patient_id, "genotype"] = gt_cat.category.cat_id + + data.loc[patient.patient_id, "phenotype"] = pheno_scorer.score(patient) + + # Sort by PatientCategory.cat_id and unpack. + # For now, we only allow to have up to 2 groups. + x_key, y_key = sorted(data["genotype"].dropna().unique()) + x = data.loc[data["genotype"] == x_key, "phenotype"].to_numpy(dtype=float) + y = data.loc[data["genotype"] == y_key, "phenotype"].to_numpy(dtype=float) + pval = self._statistic.compute_pval(scores=(x, y)) + + return PhenotypeScoreAnalysisResult( + genotype_phenotype_scores=data, + pval=pval, + ) diff --git a/src/gpsea/analysis/pscore/_impl.py b/src/gpsea/analysis/pscore/_impl.py new file mode 100644 index 00000000..b2ce8bd2 --- /dev/null +++ b/src/gpsea/analysis/pscore/_impl.py @@ -0,0 +1,135 @@ +import typing + +import hpotk + +from gpsea.model import Patient +from ._api import PhenotypeScorer + + +class CountingPhenotypeScorer(PhenotypeScorer): + """ + `CountingPhenotypeScorer` assigns the patient with a phenotype score + that is equivalent to the count of present phenotypes that are either + an exact match to the `query` terms or their descendants. + + For instance, we may want to count whether an individual has brain, liver, kidney, and skin abormalities. + In the case, the query would include the corresponding terms (e.g., Abnormal brain morphology HP:0012443). + An individual can then have between 0 and 4 phenotype group abnormalities. + This predicate is intended to be used with the Mann Whitney U test. + + Example + ^^^^^^^ + We first need to load HPO using HPO toolkit: + + >>> import hpotk + >>> store = hpotk.configure_ontology_store() + >>> hpo = store.load_minimal_hpo(release='v2024-07-01') + + Now we can create `CountingPhenotypeScorer` to test for presence of brain, liver, kidney and skin abnormalities: + + >>> from gpsea.analysis.pscore import CountingPhenotypeScorer + >>> phenotype_groups = ( + ... "HP:0012443", # Abnormal brain morphology + ... "HP:0410042", # Abnormal liver morphology + ... "HP:0012210", # Abnormal renal morphology + ... "HP:0011121", # Abnormal skin morphology + ... ) + >>> scorer = CountingPhenotypeScorer.from_query_curies( + ... hpo=hpo, + ... query=phenotype_groups, + ... ) + + """ + + @staticmethod + def from_query_curies( + hpo: hpotk.MinimalOntology, + query: typing.Iterable[typing.Union[str, hpotk.TermId]], + ): + """ + Create a scorer to test for the number of phenotype terms that fall into the phenotype groups. + + :param hpo: HPO as represented by :class:`~hpotk.ontology.MinimalOntology` of HPO toolkit. + :param query: an iterable of the top-level terms, either represented as CURIEs (`str`) + or as term IDs. + """ + query_term_ids = set() + for q in query: + # First check if the query items are Term IDs or curies. + if isinstance(q, str): + q = hpotk.TermId.from_curie(q) + elif isinstance(q, hpotk.TermId): + pass + else: + raise ValueError( + f"query argument must be iterable of hpotk TermId's or strings but we found {type(q)}" + ) + + # Now chack that the term IDs are HPO term IDs. + if q not in hpo: + raise ValueError(f"The query {q} was not found in the HPO") + query_term_ids.add(q) + + if len(query_term_ids) == 0: + raise ValueError("`query` must not be empty") + + # the query terms must not include a term and its ancestor + for q in query_term_ids: + for anc in hpo.graph.get_ancestors(q): + if anc in query_term_ids: + raise ValueError( + f"Both {q} and its ancestor term {anc} were found in the query, " + + "but query terms must not include a term and its ancestor" + ) + + return CountingPhenotypeScorer( + hpo=hpo, + query=query_term_ids, + ) + + def __init__( + self, + hpo: hpotk.MinimalOntology, + query: typing.Iterable[hpotk.TermId], + ): + self._hpo = hpo + self._query = set(query) + + def get_question(self) -> str: + return "How many of the query HPO terms (or their descendants) does the individual display" + + def score( + self, + patient: Patient, + ) -> float: + """ + Get the count (number) of terms in the query set + that have matching terms (exact matches or descendants) in the patient. + Do not double count if the patient has two terms + (e.g., two different descendants) of one of the query terms. + """ + count = 0 + for q in self._query: + for pf in patient.present_phenotypes(): + hpo_id = pf.identifier + if hpo_id == q or any( + anc == q for anc in self._hpo.graph.get_ancestors(hpo_id) + ): + count += 1 + # We break the inner loop to continue the outer. + break + + # A sanity check - we cannot produce more counts than there are categories! + assert 0 <= count <= len(self._query) + + return count + + # def __call__( + # self, + # *args: typing.Any, + # **kwds: typing.Any, + # ) -> float: + # # TODO: move to `PhenotypeScorer` API. + # assert len(args) == 1 and isinstance(args[0], Patient), 'The first argument must be an instance of `Patient`' + # assert len(kwds) == 0, 'We do not take any key-word arguments' + # return self.score(args[0]) diff --git a/src/gpsea/analysis/pscore/stats/__init__.py b/src/gpsea/analysis/pscore/stats/__init__.py new file mode 100644 index 00000000..e46bfb5e --- /dev/null +++ b/src/gpsea/analysis/pscore/stats/__init__.py @@ -0,0 +1,7 @@ +from ._stats import PhenotypeScoreStatistic +from ._stats import MannWhitneyStatistic + +__all__ = [ + 'PhenotypeScoreStatistic', + 'MannWhitneyStatistic', +] diff --git a/src/gpsea/analysis/pscore/stats/_stats.py b/src/gpsea/analysis/pscore/stats/_stats.py new file mode 100644 index 00000000..01cfa4d9 --- /dev/null +++ b/src/gpsea/analysis/pscore/stats/_stats.py @@ -0,0 +1,44 @@ +import abc +import typing + +from scipy.stats import mannwhitneyu + + +class PhenotypeScoreStatistic(metaclass=abc.ABCMeta): + """ + `PhenotypeScoreStatistic` calculates a p value + for 2 or more phenotype score groups + computed by a :class:`~gpsea.analysis.pscore.PhenotypeScorer`. + """ + + @abc.abstractmethod + def compute_pval( + self, + scores: typing.Collection[typing.Sequence[float]], + ) -> float: + pass + + +class MannWhitneyStatistic(PhenotypeScoreStatistic): + """ + `MannWhitneyStatistic` is a wrapper around SciPy's + :func:`~scipy.stats.mannwhitneyu` function to apply + Mann-Whitney U rank test on 2 phenotype scores. + + See :ref:`phenotype-score-stats` for an example usage. + """ + + def compute_pval( + self, + scores: typing.Collection[typing.Sequence[float]], + ) -> float: + assert len(scores) == 2, 'Mann-Whitney U rank test only supports 2 categories at this time' + + x, y = scores + _, pval = mannwhitneyu( + x=x, + y=y, + alternative='two-sided', + ) + + return pval diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py index 270ebc03..3209748e 100644 --- a/src/gpsea/model/_cohort.py +++ b/src/gpsea/model/_cohort.py @@ -107,7 +107,7 @@ def __hash__(self) -> int: return hash((self._labels, self._variants, self._phenotypes, self._diseases)) -class Cohort(typing.Sized): +class Cohort(typing.Sized, typing.Iterable[Patient]): @staticmethod def from_patients( @@ -141,7 +141,7 @@ def __init__( members: typing.Iterable[Patient], excluded_member_count: int, ): - self._patient_set = frozenset(members) + self._members = tuple(set(members)) self._excluded_count = excluded_member_count @property @@ -149,25 +149,25 @@ def all_patients(self) -> typing.Collection[Patient]: """ Get a collection of all patients in the cohort. """ - return self._patient_set + return self._members def all_phenotypes(self) -> typing.Set[Phenotype]: """ Get a set of all phenotypes (observed or excluded) in the cohort members. """ - return set(itertools.chain(phenotype for patient in self._patient_set for phenotype in patient.phenotypes)) + return set(itertools.chain(phenotype for patient in self._members for phenotype in patient.phenotypes)) def all_diseases(self) -> typing.Set[Disease]: """ Get a set of all diseases (observed or excluded) in the cohort members. """ - return set(itertools.chain(disease for patient in self._patient_set for disease in patient.diseases)) + return set(itertools.chain(disease for patient in self._members for disease in patient.diseases)) def all_variants(self) -> typing.Set[Variant]: """ Get a set of all variants observed in the cohort members. """ - return set(itertools.chain(variant for patient in self._patient_set for variant in patient.variants)) + return set(itertools.chain(variant for patient in self._members for variant in patient.variants)) @property def all_transcript_ids(self) -> typing.Set[str]: @@ -181,13 +181,13 @@ def total_patient_count(self): """ Get the total number of cohort members. """ - return len(self._patient_set) + return len(self._members) def get_patient_ids(self) -> typing.Set[str]: """ Get a set of the patient IDs. """ - return set(pat.patient_id for pat in self._patient_set) + return set(pat.patient_id for pat in self._members) def list_present_phenotypes( self, @@ -205,7 +205,7 @@ def list_present_phenotypes( number of patients with that phenotype) """ counter = Counter() - for patient in self._patient_set: + for patient in self._members: counter.update(p.identifier.value for p in patient.phenotypes if p.is_present) return counter.most_common(top) @@ -214,7 +214,7 @@ def list_all_diseases( top=None, ) -> typing.Sequence[typing.Tuple[hpotk.TermId, int]]: counter = Counter() - for patient in self._patient_set: + for patient in self._members: counter.update(d.identifier for d in patient.diseases) return counter.most_common(top) @@ -230,7 +230,7 @@ def list_all_variants( list: A sequence of tuples, formatted (variant key, number of patients with that variant) """ counter = Counter() - for patient in self._patient_set: + for patient in self._members: counter.update(variant.variant_info.variant_key for variant in patient.variants) return counter.most_common(top) @@ -246,7 +246,7 @@ def list_all_proteins( list: A list of tuples, formatted (protein ID string, the count of variants that affect the protein) """ counter = Counter() - for patient in self._patient_set: + for patient in self._members: counter.update(txa.protein_id for variant in patient.variants for txa in variant.tx_annotations) return counter.most_common(top) @@ -287,13 +287,16 @@ def get_variant_by_key(self, variant_key) -> Variant: raise ValueError(f"Variant key {variant_key} not found in cohort.") def __eq__(self, other): - return isinstance(other, Cohort) and self._patient_set == other._patient_set + return isinstance(other, Cohort) and self._members == other._members + + def __iter__(self) -> typing.Iterator[Patient]: + return iter(self._members) def __len__(self) -> int: - return len(self._patient_set) + return len(self._members) def __repr__(self): - return f'Cohort(members={self._patient_set}, excluded_count={self._excluded_count})' + return f'Cohort(members={self._members}, excluded_count={self._excluded_count})' def __str__(self): return repr(self) diff --git a/src/gpsea/model/_gt.py b/src/gpsea/model/_gt.py index 354d4b23..1940adbb 100644 --- a/src/gpsea/model/_gt.py +++ b/src/gpsea/model/_gt.py @@ -41,9 +41,11 @@ class Genotypes(typing.Sized, typing.Iterable): >>> b = SampleLabels('B') We can use one of the static methods to create an instance. Either a single genotype: + >>> gt = Genotypes.single(a, Genotype.HETEROZYGOUS) - Or genotypes of several samples: + or genotypes of several samples: + >>> gts = Genotypes.from_mapping({a: Genotype.HETEROZYGOUS, b: Genotype.HOMOZYGOUS_ALTERNATE}) There are 2 genotypes in the container: diff --git a/src/gpsea/model/_variant.py b/src/gpsea/model/_variant.py index e73e818e..ebc8ae07 100644 --- a/src/gpsea/model/_variant.py +++ b/src/gpsea/model/_variant.py @@ -439,8 +439,7 @@ def change_length(self) -> int: """ Get the change of length between the `ref` and `alt` alleles due to the variant presence. - SNVs lead to change length of zero, deletions and insertions/duplications lead to negative - and positive change lengths, respectively. + See :ref:`change-length-of-an-allele` for more info. """ return self._change_length @@ -869,9 +868,8 @@ def __init__( @property def variant_info(self) -> VariantInfo: """ - Returns: - VariantInfo: A representation of the variant data for sequence and symbolic variants, - as well as for large imprecise SVs. + Get the representation of the variant data for sequence and symbolic variants, + as well as for large imprecise SVs. """ return self._variant_info diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index 91fc7856..67c14eab 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -359,7 +359,7 @@ def load_phenopackets( """ Map phenopacket JSON file into patient, validate the patient data, and assemble the patients into a cohort. - :param pp_directory: path to a folder with phenopacket JSON files. An error is raised if the path does not point to + :param phenopackets: path to a folder with phenopacket JSON files. An error is raised if the path does not point to a directory with at least one phenopacket. :param cohort_creator: cohort creator for turning a sequence of phenopacket into a :class:`~gpsea.model.Cohort`. diff --git a/src/gpsea/view/_stats.py b/src/gpsea/view/_stats.py index a7bdb1af..b232ad15 100644 --- a/src/gpsea/view/_stats.py +++ b/src/gpsea/view/_stats.py @@ -1,7 +1,10 @@ import typing +from collections import Counter + from jinja2 import Environment, PackageLoader -from gpsea.analysis import HpoMtcReport + +from gpsea.analysis.pcats import HpoTermAnalysisResult class MtcStatsViewer: @@ -16,45 +19,47 @@ def __init__(self): def process( self, - hpo_mtc_report: HpoMtcReport, + result: HpoTermAnalysisResult, ) -> str: """ - Create an HTML that should be shown with `display(HTML(..))` of the IPython package. + Create an HTML to present MTC part of the :class:`HpoTermAnalysisResult`. + + Use the `display(HTML(..))` functions of the IPython package. Args: - hpo_mtc_report (HpoMtcReport): summary of heuristic term filtering procedure + result (HpoTermAnalysisResult): the result to show Returns: str: an HTML string with parameterized template for rendering or writing into a standalone HTML file. """ - context = self._prepare_context(hpo_mtc_report) + assert isinstance(result, HpoTermAnalysisResult) + context = self._prepare_context(result) return self._cohort_template.render(context) @staticmethod def _prepare_context( - hpo_mtc_report: HpoMtcReport, + report: HpoTermAnalysisResult, ) -> typing.Mapping[str, typing.Any]: - results_map = hpo_mtc_report.skipped_terms_dict - if not isinstance(results_map, dict): - raise ValueError( - "hpo_mtc_report.skipped_terms_dict must be dictionary " - f"but was {type(hpo_mtc_report.skipped_terms_dict)}" - ) + counts = Counter() + for result in report.mtc_filter_results: + if result.is_filtered_out(): + counts[result.reason] += 1 n_skipped = 0 reason_to_count = list() - for reason, count in sorted(results_map.items(), key=lambda item: item[1], reverse=True): + for reason, count in sorted(counts.items(), key=lambda item: item[1], reverse=True): reason_to_count.append({"reason": reason, "count": count}) n_skipped += count - n_tested = hpo_mtc_report.n_terms_before_filtering - n_skipped + n_all = len(report.phenotypes) + n_tested = n_all - n_skipped # The following dictionary is used by the Jinja2 HTML template return { - "mtc_name": hpo_mtc_report.mtc_method, - "hpo_mtc_filter_name": hpo_mtc_report.filter_method, + "mtc_name": report.mtc_name, + "hpo_mtc_filter_name": report.mtc_filter_name, "skipped_hpo_count": n_skipped, "tested_hpo_count": n_tested, - "total_hpo_count": hpo_mtc_report.n_terms_before_filtering, + "total_hpo_count": n_all, "reason_to_count": reason_to_count, } diff --git a/tests/analysis/pcats/__init__.py b/tests/analysis/pcats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/analysis/pcats/test_disease.py b/tests/analysis/pcats/test_disease.py new file mode 100644 index 00000000..48acece3 --- /dev/null +++ b/tests/analysis/pcats/test_disease.py @@ -0,0 +1,50 @@ +import hpotk +import pytest + +from gpsea.model import Cohort + +from gpsea.analysis.pcats import DiseaseAnalysis +from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import DiseasePresencePredicate + + +class TestDiseaseAnalysis: + + @pytest.fixture(scope='class') + def count_statistic(self) -> CountStatistic: + return ScipyFisherExact() + + @pytest.fixture(scope='class') + def suox_disease(self) -> DiseasePresencePredicate: + sulfite_oxidase_deficiency = hpotk.TermId.from_curie('OMIM:272300') + return DiseasePresencePredicate( + disease_id_query=sulfite_oxidase_deficiency, + ) + + @pytest.fixture + def analysis( + self, + count_statistic: CountStatistic, + ) -> DiseaseAnalysis: + return DiseaseAnalysis( + count_statistic=count_statistic, + mtc_correction='fdr_bh', + mtc_alpha=.05, + ) + + def test_compare_genotype_vs_phenotypes( + self, + analysis: DiseaseAnalysis, + suox_cohort: Cohort, + suox_gt_predicate: GenotypePolyPredicate, + suox_disease: DiseasePresencePredicate, + ): + results = analysis.compare_genotype_vs_phenotypes( + cohort=suox_cohort.all_patients, + gt_predicate=suox_gt_predicate, + pheno_predicates=(suox_disease,), + ) + + assert results is not None + # TODO: improve testing diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py new file mode 100644 index 00000000..516bf9fc --- /dev/null +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -0,0 +1,58 @@ +import typing + +import hpotk +import pytest + +from gpsea.model import Cohort + +from gpsea.analysis.mtc_filter import PhenotypeMtcFilter, HpoMtcFilter +from gpsea.analysis.pcats import HpoTermAnalysis +from gpsea.analysis.pcats.stats import CountStatistic, ScipyFisherExact +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate + + +class TestHpoTermAnalysis: + + @pytest.fixture(scope='class') + def count_statistic(self) -> CountStatistic: + return ScipyFisherExact() + + @pytest.fixture(scope='class') + def phenotype_mtc_filter( + self, + hpo: hpotk.MinimalOntology, + ) -> PhenotypeMtcFilter: + return HpoMtcFilter.default_filter( + hpo=hpo, + term_frequency_threshold=.2, + ) + + @pytest.fixture + def analysis( + self, + count_statistic: CountStatistic, + phenotype_mtc_filter: PhenotypeMtcFilter, + ) -> HpoTermAnalysis: + return HpoTermAnalysis( + count_statistic=count_statistic, + mtc_filter=phenotype_mtc_filter, + mtc_correction='fdr_bh', + mtc_alpha=.05, + ) + + def test_compare_genotype_vs_phenotypes( + self, + analysis: HpoTermAnalysis, + suox_cohort: Cohort, + suox_gt_predicate: GenotypePolyPredicate, + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]] + ): + results = analysis.compare_genotype_vs_phenotypes( + cohort=suox_cohort.all_patients, + gt_predicate=suox_gt_predicate, + pheno_predicates=suox_pheno_predicates, + ) + + assert results is not None + # TODO: improve testing diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index eb8a0ca6..3e71283b 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -1,8 +1,8 @@ import pytest from gpsea.model import * -from gpsea.analysis.predicate import GenotypePolyPredicate from gpsea.analysis.predicate.genotype import ( + GenotypePolyPredicate, groups_predicate, VariantPredicates, ) diff --git a/tests/analysis/test_analysis.py b/tests/analysis/test_analysis.py index a6888300..5db24621 100644 --- a/tests/analysis/test_analysis.py +++ b/tests/analysis/test_analysis.py @@ -14,9 +14,9 @@ from gpsea.model import * from gpsea.model.genome import * -from gpsea.analysis.predicate import GenotypePolyPredicate, PatientCategories +from gpsea.analysis.predicate import PatientCategories +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate -from gpsea.analysis.predicate.genotype import VariantPredicate def test_apply_predicates_on_patients( diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index 1eb4d8e8..0b060516 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -5,8 +5,10 @@ import pandas as pd import pytest -from gpsea.analysis import HpoMtcFilter, SpecifiedTermsMtcFilter, apply_predicates_on_patients -from gpsea.analysis.predicate import PatientCategories, GenotypePolyPredicate +from gpsea.analysis import apply_predicates_on_patients +from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter +from gpsea.analysis.predicate import PatientCategories +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate from gpsea.model import Cohort @@ -154,7 +156,7 @@ def test_filter_terms_to_test( filtered_n_usable, filtered_all_counts, reason_for_filtering_out = mtc_report assert reason_for_filtering_out['Skipping term because all genotypes have same HPO observed proportions'] == 1 - assert reason_for_filtering_out['Skipping general term'] == 14 + assert reason_for_filtering_out['Skipping general term'] == 16 assert reason_for_filtering_out['Skipping non-target term'] == 5 assert reason_for_filtering_out['Skipping top level term'] == 0 diff --git a/tests/analysis/predicate/phenotype/test_predicate.py b/tests/analysis/test_pscore.py similarity index 97% rename from tests/analysis/predicate/phenotype/test_predicate.py rename to tests/analysis/test_pscore.py index 345fa016..9b632902 100644 --- a/tests/analysis/predicate/phenotype/test_predicate.py +++ b/tests/analysis/test_pscore.py @@ -4,7 +4,7 @@ import pytest from gpsea.model import Patient, Phenotype, SampleLabels -from gpsea.analysis.predicate.phenotype import CountingPhenotypeScorer +from gpsea.analysis.pscore import CountingPhenotypeScorer class TestCountingPhenotypeScorer: diff --git a/tests/conftest.py b/tests/conftest.py index c3d6387b..54f012ec 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,7 @@ import hpotk import pytest -from gpsea.analysis.predicate import GenotypePolyPredicate -from gpsea.analysis.predicate.genotype import VariantPredicates, VariantPredicate, boolean_predicate +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder from gpsea.model import * diff --git a/tests/test_tutorial.py b/tests/test_tutorial.py new file mode 100644 index 00000000..52924eff --- /dev/null +++ b/tests/test_tutorial.py @@ -0,0 +1,90 @@ +import pytest +import ppktstore +import hpotk + +from gpsea.model import Cohort, VariantEffect +from gpsea.preprocessing import configure_caching_cohort_creator, CohortCreator, load_phenopackets +from gpsea.analysis.pcats import HpoTermAnalysis +from gpsea.analysis.mtc_filter import HpoMtcFilter +from gpsea.analysis.pcats.stats import ScipyFisherExact +from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate +from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest + + +class TestTutorial: + + @pytest.fixture(scope='class') + def cohort_creator( + self, + hpo: hpotk.MinimalOntology, + ) -> CohortCreator: + return configure_caching_cohort_creator( + hpo=hpo, + ) + + @pytest.fixture(scope='class') + def cohort( + self, + cohort_creator: CohortCreator, + ) -> Cohort: + registry = ppktstore.registry.configure_phenopacket_registry() + with registry.open_phenopacket_store('0.1.18') as ps: + phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5')) + + cohort, _ = load_phenopackets( + phenopackets=phenopackets, + cohort_creator=cohort_creator, + ) + return cohort + + @pytest.fixture + def mtc_filter( + self, + hpo: hpotk.MinimalOntology, + ) -> HpoMtcFilter: + return HpoMtcFilter.default_filter( + hpo=hpo, + term_frequency_threshold=0.2, + ) + + @pytest.fixture + def hpo_term_analysis( + self, + mtc_filter, + ) -> HpoTermAnalysis: + return HpoTermAnalysis( + count_statistic=ScipyFisherExact(), + mtc_filter=mtc_filter, + mtc_correction='fdr_bh', + mtc_alpha=0.05, + ) + + @pytest.mark.skip('Just for manual debugging') + def test_compare_genotype_vs_phenotype( + self, + hpo: hpotk.MinimalOntology, + cohort: Cohort, + hpo_term_analysis: HpoTermAnalysis, + ): + tx_id = 'NM_181486.4' + + gt_predicate = groups_predicate( + predicates=( + VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), + VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), + ), + group_names=('Missense', 'Frameshift',), + ) + pheno_predicates = prepare_predicates_for_terms_of_interest( + cohort=cohort, + hpo=hpo, + min_n_of_patients_with_term=2, + ) + + result = hpo_term_analysis.compare_genotype_vs_phenotypes( + cohort=cohort, + gt_predicate=gt_predicate, + pheno_predicates=pheno_predicates, + ) + + assert result is not None diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index 0cdacc3a..691f4f6a 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -1,11 +1,52 @@ +import hpotk +import math import pytest -from gpsea.analysis import HpoMtcReport +import pandas as pd + +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.analysis.predicate import PatientCategories +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer class TestStatsViewable: + @pytest.fixture(scope='class') + def hpo_term_analysis_result( + self, + suox_gt_predicate: GenotypePolyPredicate, + ) -> HpoTermAnalysisResult: + return HpoTermAnalysisResult( + phenotypes=( + hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly + hpotk.TermId.from_curie('HP:0001250'), # Seizure + ), + n_usable=(40, 20), + all_counts=( + pd.DataFrame( + data=[[10, 5], [10, 15]], + index=pd.Index((PatientCategories.YES, PatientCategories.NO,)), + columns=pd.Index(suox_gt_predicate.get_categories()) + ), + pd.DataFrame( + data=[[5, 0], [5, 10]], + index=pd.Index((PatientCategories.YES, PatientCategories.NO,)), + columns=pd.Index(suox_gt_predicate.get_categories()) + ), + ), + pvals=(math.nan, .005,), + corrected_pvals=(math.nan, .01), + gt_predicate=suox_gt_predicate, + mtc_filter_name='Random MTC filter', + mtc_filter_results=( + PhenotypeMtcResult.fail("Not too interesting"), + PhenotypeMtcResult.ok(), + ), + mtc_name='fdr_bh', + ) + @pytest.fixture def stats_viewer(self) -> MtcStatsViewer: return MtcStatsViewer() @@ -14,20 +55,8 @@ def stats_viewer(self) -> MtcStatsViewer: def test_process( self, stats_viewer: MtcStatsViewer, + hpo_term_analysis_result: HpoTermAnalysisResult ): - mtc_report = HpoMtcReport( - filter_name='identity filter', - mtc_name='bonferroni', - filter_results_map={ - # The reason for skipping a phenotype -> the number of phenotypes skipped for the reason - 'I slept bad tonight': 0, - 'The sun is too bright today': 5, - 'Life is a conspiracy': 80, - 'I need coffee': 7, - }, - n_terms_before_filtering=100, # The filtered out (80 + 7 + 5) + the unfiltered - ) - - report = stats_viewer.process(hpo_mtc_report=mtc_report) + report = stats_viewer.process(result=hpo_term_analysis_result) with open('mtc_stats.html', 'w') as fh: fh.write(report)