diff --git a/pascalx b/pascalx index 2c3f168..a304908 100755 --- a/pascalx +++ b/pascalx @@ -40,7 +40,8 @@ parser_genescoring.add_argument("gwas",help="tab separated file with GWAS data ( parser_genescoring.add_argument("-sh","--skip_head",type=lambda x: (str(x).lower() == 'true'),default=True,help="first line is header [True|False], default=True") parser_genescoring.add_argument("-cr","--col_rsid",type=int,default=0,help="column with rsids, default=0") parser_genescoring.add_argument("-cp","--col_pval",type=int,default=1,help="column with p-values, default=1") - +parser_genescoring.add_argument("-m","--method",type=str,default='saddle',help="method for gene scoring [saddle|auto|pearson|satterthwaite|ruben|davies], default=saddle") +parser_genescoring.add_argument("-mr","--rescore",type=bool,default=True,help="Rescore failed genes with backup method [True|False]") # X-scoring parser_xscoring = subparsers.add_parser('xscoring',description="Xscorer") @@ -114,9 +115,9 @@ if __name__ == "__main__": print("Starting genescoring") if args.chr == 'all': - R = G.score_all(parallel=args.parallel,nobar=args.nobar) + R = G.score_all(parallel=args.parallel,nobar=args.nobar,method=args.method,autorescore=args.rescore) else: - R = G.score_chr(chrs=list(ast.literal_eval(args.chr)),parallel=args.parallel,nobar=args.nobar) + R = G.score_chr(chrs=list(ast.literal_eval(args.chr)),parallel=args.parallel,nobar=args.nobar,method=args.method,autorescore=args.rescore) if args.pathway is None: G.save_scores(args.outfile+".tsv") @@ -157,9 +158,9 @@ if __name__ == "__main__": P = pathway.chi2rank(G) M = P.load_modules(args.pathway,args.col_name,args.col_symb) if args.chr == 'all': - R = P.score(M,parallel=args.parallel,nobar=args.nobar,genes_only=args.genes_only) + R = P.score(M,parallel=args.parallel,nobar=args.nobar,genes_only=args.genes_only,method=args.method,autorescore=args.rescore) else: - R = P.score(M,parallel=args.parallel,nobar=args.nobar,chrs_only=args.chr,genes_only=args.genes_only) + R = P.score(M,parallel=args.parallel,nobar=args.nobar,chrs_only=args.chr,genes_only=args.genes_only,method=args.method,autorescore=args.rescore) if args.genes_only: G.save_scores(args.outfile+".tsv") diff --git a/python/PascalX/genescorer.py b/python/PascalX/genescorer.py index 090d589..17e55fd 100755 --- a/python/PascalX/genescorer.py +++ b/python/PascalX/genescorer.py @@ -899,7 +899,7 @@ def _scoremain(self,gene,unloadRef=False,method='saddle',mode='auto',reqacc=1e-1 if G[i] in self._GENEID: with lock: - pbar.set_postfix_str(str(self._GENEID[G[i]][4])) + pbar.set_postfix_str(str(self._GENEID[G[i]][4]).ljust(15)) cr = self._GENEID[G[i]][0] @@ -963,7 +963,7 @@ def _scoremain(self,gene,unloadRef=False,method='saddle',mode='auto',reqacc=1e-1 with lock: - pbar.set_postfix_str("done") + pbar.set_postfix_str("done".ljust(15)) pbar.close() return RESULT,FAIL,TOTALFAIL @@ -985,6 +985,21 @@ def score(self,gene,parallel=1,unloadRef=False,method='saddle',mode='auto',reqac autorescore(bool): Automatically try to re-score failed genes via Pearson's algorithm """ + + # Check method + methods = ['auto','saddle','pearson','satterthwaite','ruben','davies'] + + if method not in methods: + print("No valid scoring method set. Available methods:",methods) + return None + else: + print("Scoring with method",method) + + if type(autorescore) is str and autorescore not in methods: + print("No valid scoring method set for rescoring. Available methods:",methods) + return None + + G = [] for i in range(0,len(gene)): @@ -1004,12 +1019,13 @@ def score(self,gene,parallel=1,unloadRef=False,method='saddle',mode='auto',reqac S = np.array_split(G,parallel) result_objs = [] - pool = mp.Pool(max(1,min(parallel,mp.cpu_count()))) + pool = mp.Pool(max(1,min(len(S),mp.cpu_count()))) for i in range(0,len(S)): - - result = pool.apply_async(self._scoremain, (S[i],True,method,mode,reqacc,intlimit,'',i,nobar,lock,keep_idx)) - result_objs.append(result) + + if len(S[i]) > 0: + result = pool.apply_async(self._scoremain, (S[i],True,method,mode,reqacc,intlimit,'',i,nobar,lock,keep_idx)) + result_objs.append(result) results = [result.get() for result in result_objs] @@ -1031,9 +1047,21 @@ def score(self,gene,parallel=1,unloadRef=False,method='saddle',mode='auto',reqac for X in R[0]: self._SCORES[X[0]] = float(X[1]) - if autorescore and len(R[1]) > 0: - print("Rescoreing failed genes") - R = self.rescore(R,method='pearson',mode='auto',reqacc=1e-100,intlimit=10000000,parallel=parallel,nobar=nobar,keep_idx=keep_idx) + if (len(R[1]) > 0 and + ( + (type(autorescore)==bool and autorescore) + or autorescore in methods + ) + ): + + if type(autorescore)==bool: + method = 'pearson' + else: + method = autorescore + + print("Rescoreing failed genes with method",method) + + R = self.rescore(R,method=method,mode='auto',reqacc=1e-100,intlimit=10000000,parallel=parallel,nobar=nobar,keep_idx=keep_idx) if len(R[1])>0: print(len(R[1]),"genes failed to be scored") @@ -1091,14 +1119,15 @@ def rescore(self,RESULT,method='pearson',mode='auto',reqacc=1e-100,intlimit=1000 RES = [[],[],[]] S = np.array_split(G,parallel) - pool = mp.Pool(max(1,min(parallel,mp.cpu_count()))) + pool = mp.Pool(max(1,min(len(S),mp.cpu_count()))) result_objs = [] for i in range(0,len(S)): - result = pool.apply_async(self._scoremain, (S[i],True,method,mode,reqacc,intlimit,'',i,nobar,lock,keep_idx)) - result_objs.append(result) + if len(S[i]) > 0: + result = pool.apply_async(self._scoremain, (S[i],True,method,mode,reqacc,intlimit,'',i,nobar,lock,keep_idx)) + result_objs.append(result) results = [result.get() for result in result_objs] @@ -1116,6 +1145,9 @@ def rescore(self,RESULT,method='pearson',mode='auto',reqacc=1e-100,intlimit=1000 for X in RES[0]: self._SCORES[X[0]] = float(X[1]) + print(len(RES[0]),"genes successfully rescored.") + if len(RES[1]) > 0: + print(len(RES[1]),"genes still failed.") RESULT[2].extend(RES[2]) RESULT[1].clear() @@ -1158,7 +1190,9 @@ def score_chr(self,chrs,unloadRef=False,method='saddle',mode='auto',reqacc=1e-10 res = self.score(G,parallel,unloadRef,method,mode,reqacc,intlimit,nobar,autorescore,keep_idx) toc = time.time() - print("[time]:",str(round(toc-tic,2))+"s;",round(len(G)/(toc-tic),2),"genes/s") + + if res is not None: + print("[time]:",str(round(toc-tic,2))+"s;",round(len(G)/(toc-tic),2),"genes/s") return res