forked from lasersonlab/phip-stat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
counts2pvals.py
executable file
·156 lines (135 loc) · 5.66 KB
/
counts2pvals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#! /usr/bin/env python
# Copyright 2014 Uri Laserson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import math
import sys

import numpy as np
import scipy as sp
import scipy.optimize
# Command-line interface: -i/--input is a CSV of per-clone counts,
# -o/--output receives one -log10 p-value per output column per clone.
argparser = argparse.ArgumentParser(description=None)
argparser.add_argument('-i','--input',required=True)
argparser.add_argument('-o','--output',required=True)
args = argparser.parse_args()
inhandle = open(args.input,'r')
outhandle = open(args.output,'w')
# Largest float64 strictly less than 1.0 — used as the upper bracket for the
# lambda root-finder so the estimate stays strictly below 1.
lt1 = 1. - np.finfo(np.float64).epsneg
def GP_lambda_likelihood(counts):
    """Build the generalized-Poisson score function for lambda.

    Returns a function of lam whose root on [0, 1) is the maximum-likelihood
    estimate of the GP dispersion parameter for `counts`.

    Raises ValueError when the condition guaranteeing a unique root fails.
    """
    # Histogram the counts: nx[k] = number of observations equal to k.
    bin_edges = range(max(counts) + 2)
    nx, edges = np.histogram(counts, bins=bin_edges)
    x = edges[:-1]
    n = len(counts)
    x_bar = sum(counts) / float(n)
    # Uniqueness condition for the root of the score function.
    uniqueness_stat = sum(nx[2:] * x[2:] * (x[2:] - 1)) - n * (x_bar ** 2)
    if uniqueness_stat <= 0:
        msg = "Condition for uniqueness of lambda is not met.\n x: %s\n n: %s\n x_bar: %s\n" % (x, n, x_bar)
        sys.stderr.write(msg)
        sys.stderr.flush()
        raise ValueError

    def score(lam):
        return sum(nx * (x * (x - 1) / (x_bar + (x - x_bar) * lam))) - n * x_bar

    return score
def log_GP_pmf(x, theta, lambd):
    """Return log P(X = x) under the generalized Poisson distribution.

    pmf: theta * (theta + x*lambd)**(x-1) * exp(-(theta + x*lambd)) / x!
    `x` is a nonnegative integer count; `theta` the rate-like parameter,
    `lambd` the dispersion parameter.
    """
    log = np.log
    # math.lgamma(x+1) == log(x!): O(1) and exact to double precision,
    # replacing the original O(x) np.sum(log(np.arange(1, x+1))).
    logP = log(theta) + (x - 1) * log(theta + x * lambd) - (theta + x * lambd) - math.lgamma(x + 1)
    return logP
def log_GP_sf(x, theta, lambd):
    """Return the log survival function log P(X > x) for the generalized Poisson.

    Sums the log-pmf over y = x+1, x+2, ... in log space, extending the
    summation range by 100 terms at a time.  Converged when the final term no
    longer changes the running total in double precision; gives up and
    returns NaN after 20 extensions.
    """
    extensions = 20
    start = x + 1
    end = x + 100
    # NOTE(fix): range instead of Python-2-only xrange — identical iteration,
    # forward-compatible with Python 3.
    log_pmf = [log_GP_pmf(y, theta, lambd) for y in range(start, end)]
    while extensions > 0:
        accum = np.logaddexp.accumulate(log_pmf)
        # Converged when the last term no longer moves the running sum.
        if accum[-1] == accum[-2]:
            return accum[-1]
        start = end
        end += 100
        log_pmf += [log_GP_pmf(y, theta, lambd) for y in range(start, end)]
        extensions -= 1
    # Failed to converge within 20 extensions of the summation range.
    return np.nan
# Load data
sys.stderr.write("Loading data...\n"); sys.stderr.flush()
clones = []
input_counts = []
output_counts = []
# Input rows look like: clone_name,input_count,output_count_1,...
# Lines starting with '#' are treated as comments and skipped.
for line in inhandle:
    if line.startswith('#'): continue
    data = line.split(',')
    clones.append( data[0].strip() )
    input_counts.append( int(data[1]) )
    output_counts.append( np.int_(data[2:]) )
input_counts = np.asarray(input_counts)
output_counts = np.asarray(output_counts) + 1 # pseudocounts to combat negative regressed theta
# Unique input-count values; GP parameters are estimated per value below.
uniq_input_values = list(set(input_counts))
sys.stderr.write("Num clones: %s\nInput vec shape: %s\nOutput array shape: %s\n" % (len(clones),input_counts.shape,output_counts.shape)); sys.stderr.flush()
# Estimate generalized Poisson distributions for every input count
sys.stderr.write("Computing lambdas and thetas for %i different input values...\n" % len(uniq_input_values)); sys.stderr.flush()
# Per output column: fitted lambda values, theta values, and the input-count
# values (idxs) they were estimated at.
lambdas = []
thetas = []
idxs = []
for i in xrange(output_counts.shape[1]): # for each output column...
    sys.stderr.write("  working on output column %i\n" % i); sys.stderr.flush()
    lambdas.append([])
    thetas.append([])
    idxs.append([])
    for input_value in uniq_input_values: # ...compute lambdas/thetas
        # compute lambda
        curr_counts = output_counts[input_counts == input_value,i]
        # Too few clones share this input count for a stable MLE.
        if len(curr_counts) < 50:
            continue
        try: # may fail if MLE doesn't meet uniqueness condition
            H = GP_lambda_likelihood(curr_counts)
        except ValueError:
            continue
        idxs[-1].append(input_value)
        # Root of the score function on [0, lt1] is the lambda MLE.
        lambd = sp.optimize.brentq(H, 0., lt1)
        lambdas[-1].append( lambd )
        # compute theta
        n = len(curr_counts)
        x_bar = sum(curr_counts) / float(n)
        theta = x_bar * (1 - lambd)
        thetas[-1].append( theta )
# Regression on all of the theta and lambda values computed
sys.stderr.write("Regression on lambdas and thetas...\n"); sys.stderr.flush()
lambda_fits = []
theta_fits = []
for i in xrange(output_counts.shape[1]):
    sys.stderr.write("  working on output column %i\n" % i); sys.stderr.flush()
    # NOTE(fix): bind each column's values via default arguments.  The
    # original lambdas captured the loop variables `i` and `coeffs`
    # themselves (late-binding closures), so after the loop EVERY column's
    # fit silently used the last column's mean lambda and theta regression.
    mean_lambda = np.mean(lambdas[i])
    lambda_fit = lambda x, m=mean_lambda: m
    coeffs = np.polyfit(idxs[i], thetas[i], 1)
    theta_fit = lambda x, c=coeffs: c[0] * x + c[1]
    lambda_fits.append(lambda_fit)
    theta_fits.append(theta_fit)
# Precompute CDF for possible input-output combinations
sys.stderr.write("Precomputing pval combos..."); sys.stderr.flush()
# Many clones share the same (input_count, output_count) pair, so compute
# each distinct pair's p-value once and cache it per output column.
uniq_combos = []
for i in xrange(output_counts.shape[1]):
    uniq_combos.append( set(zip(input_counts,output_counts[:,i])) )
sys.stderr.write("computing %i combos\n" % sum([len(u) for u in uniq_combos])); sys.stderr.flush()
log10pval_hash = {}
j = 0
for (i,u) in enumerate(uniq_combos):
    for (ic,oc) in u:
        if j % 1000 == 0: sys.stderr.write("...computed %i p-vals\n" % j); sys.stderr.flush()
        # Survival prob of the regressed GP at the observed output count.
        log_pval = log_GP_sf(oc,theta_fits[i](ic),lambda_fits[i](ic))
        # Convert natural-log survival probability to a -log10 p-value.
        log10pval_hash[(i,ic,oc)] = log_pval * np.log10( np.e ) * -1.
        # pval = GP_sf(oc,theta_fits[i](ic),lambda_fits[i](ic))
        # log10pval_hash[(i,ic,oc)] = -np.log10(pval)
        j += 1
# Compute p-values for each clone using regressed GP parameters
sys.stderr.write("Computing actual pvals...\n"); sys.stderr.flush()
# One CSV row per clone: clone name, then one -log10 p-value per output
# column, looked up from the precomputed cache.
for (clone, ic, ocs) in zip(clones, input_counts, output_counts):
    fields = [clone] + ["%f" % log10pval_hash[(i, ic, oc)] for (i, oc) in enumerate(ocs)]
    # NOTE(fix): file.write replaces the Python-2-only `print >>outhandle`
    # statement; the emitted bytes (row + "\n") are identical.
    outhandle.write(",".join(fields) + "\n")
# Explicit close guarantees the output buffer is flushed to disk.
outhandle.close()
# DEBUG
# logpvals = np.asarray(logpvals)
# clones = np.asarray(clones)
# for (c,p) in zip(clones[logpvals>6],logpvals[logpvals>8]):
# print "%s\t%s" % (c,p)