Optimizations

MOE-MONEY-MOE-PROBLEMS · Jan 12, 2023 · 414a366 · 414a366
1 parent d17e6c1
commit 414a366
Show file tree

Hide file tree

Showing 3 changed files with 130 additions and 10 deletions.
diff --git a/python/PascalX/genescorer.py b/python/PascalX/genescorer.py
@@ -133,6 +133,7 @@ def load_genome(self,file,ccol=1,cid=0,csymb=5,cstx=2,cetx=3,cs=4,cb=None,chrSta
         GEN = genome.genome()
         GEN.load_genome(file,ccol,cid,csymb,cstx,cetx,cs,cb,chrStart,splitchr,NAgeneid,useNAgenes,header)
 
+        self._GENOME = GEN
         self._GENEID = GEN._GENEID
         self._GENESYMB = GEN._GENESYMB
         self._GENEIDtoSYMB = GEN._GENEIDtoSYMB
@@ -142,7 +143,7 @@ def load_genome(self,file,ccol=1,cid=0,csymb=5,cstx=2,cetx=3,cs=4,cb=None,chrSta
         self._SKIPPED = GEN._SKIPPED
 
 
-    def load_mapping(self,file,gcol=0,rcol=1,wcol=None,delimiter="\t",a1col=None,a2col=None,bcol=None,pfilter=1,header=False,joint=True):
+    def load_mapping(self,file,gcol=0,rcol=1,wcol=None,delimiter="\t",a1col=None,a2col=None,bcol=None,pfilter=1,header=False,joint=True,symbol=False):
         """
         Loads a SNP to gene mapping
         
@@ -159,17 +160,22 @@ def load_mapping(self,file,gcol=0,rcol=1,wcol=None,delimiter="\t",a1col=None,a2c
             pfilter(float): Only include rows with wcol < pfilter
             header(bool): Header present
             joint(bool): Use mapping SNPs and gene window based SNPs
+            symbol(bool): True: Gene id is a gene symbol; False: Gene id is an ensembl gene id
             
         Note:
+        
             * A loaded mapping takes precedence over a loaded positional gene annotation
             * The mapping data is stored statically (same for all class initializations)
             
         """
-        M = mapper()
-        M.load_mapping(file,gcol,rcol,wcol,a1col,a2col,bcol,delimiter,pfilter,header)
-        self._MAP = M._GENEIDtoSNP
-        self._iMAP = M._SNPtoGENEID
-        self._joint = joint
+        if symbol and self._GENOME is None:
+            print("For symbol==True a genome has to be loaded first (use .load_genome)")
+        else:
+            M = mapper(self._GENOME)
+            M.load_mapping(file,gcol,rcol,wcol,a1col,a2col,bcol,delimiter,pfilter,header,symbol)
+            self._MAP = M._GENEIDtoSNP
+            self._iMAP = M._SNPtoGENEID
+            self._joint = joint
 
     def load_GWAS(self,file,rscol=0,pcol=1,bcol=None,a1col=None,a2col=None,delimiter=None,header=False,NAid='NA',log10p=False,cutoff=1e-300):
         """

diff --git a/python/PascalX/mapper.py b/python/PascalX/mapper.py
@@ -21,12 +21,14 @@
 
 
 class mapper:
-    def __init__(self):
+    def __init__(self,genome=None):
         self._GENEIDtoSNP = {}
         self._SNPtoGENEID = {}
         self._GENEDATA = {}
-
-    def load_mapping(self,file,gcol=0,rcol=1,wcol=None,a1col=None,a2col=None,bcol=None,delimiter="\t",pfilter=1,header=False):
+
+        self._GENOME = genome
+
+    def load_mapping(self,file,gcol=0,rcol=1,wcol=None,a1col=None,a2col=None,bcol=None,delimiter="\t",pfilter=1,header=False,symbol=False):
         """
         Loads a SNP to gene mapping
         
@@ -42,6 +44,7 @@ def load_mapping(self,file,gcol=0,rcol=1,wcol=None,a1col=None,a2col=None,bcol=No
             delimiter(string): Character used to separate columns
             header(bool): Header present
             pfilter(float): Only include rows with wcol < pfilter
+            symbol(bool): Gene id are gene symbols (requires genome to be set on init)
         """
         self._GENEIDtoSNP = {}
         self._SNPtoGENEID = {}
@@ -67,6 +70,12 @@ def load_mapping(self,file,gcol=0,rcol=1,wcol=None,a1col=None,a2col=None,bcol=No
 
                 gid = line[gcol]
 
+                if symbol:
+                    if gid in self._GENOME._GENESYMB:
+                        gid = self._GENOME._GENESYMB[gid]
+                    else:
+                        continue
+
                 if gid not in self._GENEIDtoSNP:
                     self._GENEIDtoSNP[gid] = {} #[[],[],[],[],[]]
 

diff --git a/python/wchissum.cpp b/python/wchissum.cpp
@@ -676,6 +676,55 @@ double oneminwchissum_m1nc0_satterthwaite_1000d(double* lambda, int N, double X)
     return x.convert_to<double>(); 
 }
 
+
+extern "C"
+double oneminwchissum_m1nc0_satterthwaite_auto(double* lambda, int N, double X) {
+
+    // Calc g,h
+    double sum1 = 0;
+    double sum2 = 0;
+    for(int i = 0; i < N; i++) {
+        sum1 += lambda[i];
+        sum2 += lambda[i]*lambda[i];
+    }
+
+    double g = sum2/sum1;
+    double h = sum1*sum1/sum2;
+
+    gamma_distribution<double> gamma(h/2,2*g);
+
+    double res =  1. - cdf(gamma,X);
+
+    if (res < 1e-15) {
+        float128 x = float128(X);
+
+        gamma_distribution<float128> gamma(h/2,2*g); 
+        res = ( 1. - cdf(gamma,x) ).convert_to<double>();
+
+        if (res < 1e-32) {
+            number<cpp_bin_float<100>> x(X);
+            gamma_distribution<number<cpp_bin_float<100>>> gamma(h/2,2*g); 
+            res = ( number<cpp_bin_float<100>>(1.) - cdf(gamma,x) ).convert_to<double>();
+
+            if (res < 1e-98) {
+                number<cpp_bin_float<200>> x(X);
+                gamma_distribution<number<cpp_bin_float<200>>> gamma(h/2,2*g); 
+                res = ( number<cpp_bin_float<200>>(1.) - cdf(gamma,x) ).convert_to<double>();
+
+                if (res < 1e-195) {
+                     number<cpp_bin_float<300>> x(X);
+                     gamma_distribution<number<cpp_bin_float<300>>> gamma(h/2,2*g); 
+                     res = ( number<cpp_bin_float<300>>(1.) - cdf(gamma,x) ).convert_to<double>();   
+                }
+            }
+        }
+    }
+
+    // Calc and return solution
+    return res;
+}
+
+/*
 extern "C"
 double oneminwchissum_m1nc0_satterthwaite_auto(double* lambda, int N, double X) {
     double res = oneminwchissum_m1nc0_satterthwaite(lambda,N,X);
@@ -698,6 +747,7 @@ double oneminwchissum_m1nc0_satterthwaite_auto(double* lambda, int N, double X)
     
     return res;
 }
+*/
 
 /*
     Pearson's approximation
@@ -771,6 +821,7 @@ double oneminwchissum_m1nc0_pearson_100d(double* lambda, int N, double X) {
 
     double h = c2*c2*c2/(c3*c3);
     double y = std::max(0.,(X-c1)*sqrt(h/c2)+h);
+
     number<cpp_bin_float<100>> Y(y);
 
     // Calc cdf  
@@ -878,6 +929,60 @@ double oneminwchissum_m1nc0_pearson_1000d(double* lambda, int N, double X) {
     return x.convert_to<double>(); 
 }
 
+extern "C"
+double oneminwchissum_m1nc0_pearson_auto(double* lambda, int N, double X) {
+    double c1 = 0;
+    double c2 = 0;
+    double c3 = 0;
+
+    for(int i = 0; i < N; i++) {
+        c1 += lambda[i];
+        c2 += lambda[i]*lambda[i];
+        c3 += lambda[i]*lambda[i]*lambda[i];
+    }
+
+    double h = c2*c2*c2/(c3*c3);
+    double y = std::max(0.,(X-c1)*sqrt(h/c2)+h);
+
+    chi_squared chisq(h);
+
+    double res = 1. - cdf(chisq,y);
+
+    if (res < 1e-15) {
+        float128 Y(y);
+
+        chi_squared_distribution<float128> chisq(h);
+        res =  ( 1. - cdf(chisq,Y) ).convert_to<double>();
+
+        if (res < 1e-32) {
+            number<cpp_bin_float<1000>> Y(y);
+            chi_squared_distribution<number<cpp_bin_float<100>>> chisq(h);
+
+            res =  ( 1. - cdf(chisq,Y) ).convert_to<double>();
+
+            if (res < 1e-98) {
+                number<cpp_bin_float<200>> Y(y);
+                chi_squared_distribution<number<cpp_bin_float<200>>> chisq(h);
+
+                res =  ( 1. - cdf(chisq,Y) ).convert_to<double>();
+
+                if (res < 1e-195) {
+                    number<cpp_bin_float<300>> Y(y);
+                    chi_squared_distribution<number<cpp_bin_float<300>>> chisq(h);
+
+                    res =  ( 1. - cdf(chisq,Y) ).convert_to<double>();
+                }
+            }
+        }
+    }
+
+    // Calc and return solution
+    return res;
+}
+
+
+
+/*
 extern "C"
 double oneminwchissum_m1nc0_pearson_auto(double* lambda, int N, double X) {
     double res = oneminwchissum_m1nc0_pearson(lambda,N,X);
@@ -900,7 +1005,7 @@ double oneminwchissum_m1nc0_pearson_auto(double* lambda, int N, double X) {
     
     return res;
 }
-
+*/
 /*
     Saddle-point approximation
     (central case)