# runMetaBootThreaded.jl
using MixedModels, CSV, DataFrames, LinearAlgebra, Statistics, GLM, Suppressor
# Pin BLAS to one thread; the bootstrap loop further down is parallelised with
# Julia threads (presumably this avoids oversubscribing cores — confirm).
LinearAlgebra.BLAS.set_num_threads(1)

# Command-line arguments, in order: drug, tissue, gene, input CSV path, output path.
# Fail early with a usage message instead of a bare BoundsError on ARGS.
if length(ARGS) < 5
    throw(ArgumentError(string(
        "expected 5 arguments: <drug> <tissue> <gene> <inputCsv> <outFile>; got ",
        length(ARGS),
    )))
end
drug = ARGS[1]
tissue = ARGS[2]
gene = ARGS[3]
filePath = ARGS[4]
outFileName = ARGS[5]
# nthread = 1::Int64 #40 threads faster than 80 on niagara

# Load the model table without pooled (categorical-style) string columns.
modelData = DataFrame(CSV.File(filePath, pool=false))
# `Column1` is dropped before modelling (presumably an unnamed row-index column
# written by the exporting tool — confirm against the producer of the CSV).
select!(modelData, Not(:Column1))
# Number of bootstrap replicates: taken from the first row of column `R`, capped
# at 10,000,000. Converted to Int so that `zeros(R)` and `1:R` also work when
# the column was parsed as a floating-point type.
R = Int(min(modelData[1, :R], 10000000))
select!(modelData, Not(:R))
"""
    scale(x)

Return `x` centred on its mean and divided by its sample standard deviation
(`Statistics.std`, i.e. the `n - 1` corrected estimator), as a `Vector{Float64}`.

Accepts any real-valued vector (integer columns, views, ...), not only
`Vector{Float64}`. There is no guard against zero or undefined variance: the
result is non-finite when `std(x)` is zero or `NaN` (e.g. a single element).
"""
function scale(x::AbstractVector{<:Real})::Array{Float64,1}
    return (x .- mean(x)) ./ std(x)
end
"""
    sampleWithinDataset(modelData, dataset)

Draw a bootstrap resample of the rows of `modelData` whose `dataset` column
equals `dataset`, and return it as a new `DataFrame`.

The resample has as many rows as the dataset itself, drawn with replacement.
If a `tissueid` column exists, `x` and `y` are centred within each tissue of the
resample; `x` and `y` are then standardized with `scale`.
"""
function sampleWithinDataset(modelData::DataFrame, dataset)::DataFrame
    rows = findall(modelData[!, :dataset] .== dataset)
    # Sample row indices with replacement; row indexing copies, so the
    # in-place edits below never touch `modelData`.
    datasetData = modelData[rand(rows, length(rows)), :]
    if "tissueid" in names(datasetData)
        # Only tissues that actually occur in this resample need centring.
        # Iterating the tissue list of the whole table would scan for absent
        # tissues and take the mean of empty groups for no effect.
        for tissue in unique(datasetData[!, :tissueid])
            removeTissueMean!(datasetData, tissue)
        end
    end
    datasetData[!, :x] = scale(datasetData[!, :x])
    datasetData[!, :y] = scale(datasetData[!, :y])
    return datasetData
end
"""
    removeTissueMean!(datasetData, tissue)

Subtract, in place, the mean of `x` and the mean of `y` computed over the rows
of `datasetData` whose `tissueid` equals `tissue`, and return `datasetData`.

The selected `x` and `y` values are asserted to be `Vector{Float64}`.
"""
function removeTissueMean!(datasetData::DataFrame, tissue)::DataFrame
    rows = findall(datasetData.tissueid .== tissue)
    xvals = datasetData[rows, :x]::Array{Float64,1}
    yvals = datasetData[rows, :y]::Array{Float64,1}
    xcenter = mean(xvals)
    ycenter = mean(yvals)
    # Both means are taken before either column is modified.
    datasetData[rows, :x] = xvals .- xcenter
    datasetData[rows, :y] = yvals .- ycenter
    return datasetData
end
"""
    scaleWithinDataset(modelData, dataset)

Return a copy of the rows of `modelData` whose `dataset` column equals
`dataset`, with no resampling.

If a `tissueid` column exists, `x` and `y` are centred within each tissue of
that subset; `x` and `y` are then standardized with `scale`.
"""
function scaleWithinDataset(modelData::DataFrame, dataset)::DataFrame
    rows = findall(modelData[!, :dataset] .== dataset)
    # Row indexing copies, so the in-place edits below never touch `modelData`.
    datasetData = modelData[rows, :]
    if "tissueid" in names(datasetData)
        # Only tissues present in this dataset need centring; tissues that occur
        # solely in other datasets would select no rows here.
        for tissue in unique(datasetData[!, :tissueid])
            removeTissueMean!(datasetData, tissue)
        end
    end
    datasetData[!, :x] = scale(datasetData[!, :x])
    datasetData[!, :y] = scale(datasetData[!, :y])
    return datasetData
end
"""
    getBootSample(modelData)

Build one two-level bootstrap sample: draw dataset labels with replacement (as
many draws as there are distinct labels), resample rows inside each drawn
dataset with `sampleWithinDataset`, and concatenate the pieces.
"""
function getBootSample(modelData::DataFrame)::DataFrame
    datasetIds = modelData[!, :dataset]
    drawn = rand(unique(datasetIds), length(unique(datasetIds)))
    pieces = [sampleWithinDataset(modelData, id) for id in drawn]
    # The pieces are fresh tables, so appending into the first one is safe.
    return reduce(append!, pieces)
end
"""
    standardizeByDataset(modelData)

Apply `scaleWithinDataset` to every distinct value of the `dataset` column (no
resampling) and concatenate the results in order of first appearance.
"""
function standardizeByDataset(modelData::DataFrame)
    pieces = [scaleWithinDataset(modelData, id) for id in unique(modelData[!, :dataset])]
    # The pieces are fresh tables, so appending into the first one is safe.
    return reduce(append!, pieces)
end
"""
    bootstrapSlopes(modelData, R)

Run `R` bootstrap replicates in parallel with `Threads.@threads` and return a
`Vector{Float64}` holding the first coefficient of the model fitted to each
replicate.

A replicate containing more than one distinct dataset is fitted with a
`LinearMixedModel` (`y ~ (x + 0 | dataset) + x + 0`); otherwise it falls back
to a plain `LinearModel` (`y ~ x + 0`).

The loop lives in a function rather than at top level so that `modelData` and
the result vector are typed locals instead of non-const globals.
"""
function bootstrapSlopes(modelData::DataFrame, R::Integer)
    slopes = zeros(R)
    # Each iteration writes only its own slot, so no locking is required.
    Threads.@threads for i in 1:R
        resampled = getBootSample(modelData)
        nDatasets = length(unique(resampled[!, :dataset]))
        # (The fits were previously wrapped in `@suppress begin ... end`;
        # that wrapper is left disabled.)
        slopes[i] = if nDatasets > 1
            coef(fit(LinearMixedModel, @formula(y ~ (x + 0 | dataset) + x + 0), resampled))[1]
        else
            coef(fit(LinearModel, @formula(y ~ x + 0), resampled))[1]
        end
    end
    return slopes
end

t = bootstrapSlopes(modelData, R)
# Timing notes: 1e4 replicates take about 8 seconds and the cost appears to scale
# linearly from there (a 20x improvement).
# Threaded on 40 threads, the pan-cancer run completes 100000 replicates in 94 seconds.
# Point estimate on the original (non-resampled) data: standardize within each
# dataset, fit the mixed model, and keep its first coefficient as `t0`.
# NOTE(review): unlike the bootstrap loop, there is no LinearModel fallback here
# when the data contain only one dataset — confirm that is intended.
modelData2 = standardizeByDataset(modelData);
m0 = fit(LinearMixedModel, @formula(y ~ (x + 0| dataset) + x + 0), modelData2);
t0 = coef(m0)[1];
# badchars = r"[,]|[;]|[:]|[-]|[+]|[*]|[%]|[$]|[#]|[{]|[}]|[[]|[]]|[|]|[\^]|[/]|[\\]|[ ]"
# tissueClean = replace(tissue, badchars => s".")
# drugClean = replace(drug, badchars => s".")
# Write the results as plain text: a "label:" line followed by its value for
# t0, N (row count of modelData) and R, then "t:" followed by one bootstrap
# value per line.
outfile = outFileName
open(outfile, "w") do io
    for (label, value) in (("t0:", t0), ("N:", nrow(modelData)), ("R:", R))
        println(io, label)
        println(io, value)
    end
    println(io, "t:")
    foreach(v -> println(io, v), t)
end
# PD.0325901 Lung ENSG00000130477