hhotnet_perm_treecut_stats_chunk.jl
#=
job_info = (project = "cov2",
            name = "cov2_permtrees_stats",
            hotnet_ver = "20200327",
            id = 0,
            chunk = 2,
            ntrees_perchunk = 2)
=#
job_info = (project = ARGS[1],
            name = ARGS[2],
            hotnet_ver = ARGS[3],
            id = parse(Int, ARGS[4]),
            chunk = parse(Int, ARGS[5]),
            ntrees_perchunk = 50)
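# Example invocation (values are hypothetical, mirroring the commented-out job_info above;
# in practice the arguments are supplied by the job scheduler):
#   julia hhotnet_perm_treecut_stats_chunk.jl cov2 cov2_permtrees_stats 20200327 0 2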
@info "Tree stats for $(job_info.project) (job '$(job_info.name)' id=$(job_info.id))" *
" chunk #$(job_info.chunk)"
proj_info = (id = job_info.project,
             hotnet_ver = job_info.hotnet_ver)
# base_scripts_path, base_analysis_path and base_scratch_path are assumed to be
# defined by the job environment (e.g. an include()'d setup script) before this point
const misc_scripts_path = joinpath(base_scripts_path, "misc_jl")
const analysis_path = joinpath(base_analysis_path, proj_info.id)
const data_path = joinpath(analysis_path, "data")
const results_path = joinpath(analysis_path, "results")
const scratch_path = joinpath(base_scratch_path, proj_info.id)
const plots_path = joinpath(analysis_path, "plots")
using JLD2, DataFrames, Serialization, CodecZstd
using LinearAlgebra, HierarchicalHotNet
const HHN = HierarchicalHotNet
# unpack only the entries needed here from the "prepare" bundle; the rest are skipped
_, _, _, _,
bait2apms_vertices, _,
bait_ids, bait2vertex_weights,
reactomefi_walkmtx, bait2reactomefi_tree,
_, bait2vertex_weights_perms =
    open(ZstdDecompressorStream, joinpath(scratch_path, "$(proj_info.id)_hotnet_prepare_$(proj_info.hotnet_ver).jlser.zst"), "r") do io
        deserialize(io)
    end
#=@load(joinpath(scratch_path, "$(proj_info.id)_hotnet_prepare_$(proj_info.hotnet_ver).jld2"),
        bait_ids, bait2vertex_weights,
        bait2apms_vertices,
        reactomefi_walkmtx, bait2reactomefi_tree,
        bait2vertex_weights_perms)=#
#=@load(joinpath(scratch_path, "$(proj_info.id)_hotnet_permtrees_$(proj_info.hotnet_ver).jld2"),
        bait2reactomefi_perm_trees)
=#
bait2reactomefi_perm_trees = open(ZstdDecompressorStream, joinpath(scratch_path, "$(proj_info.id)_hotnet_permtrees_$(proj_info.hotnet_ver).jlser.zst"), "r") do io
    deserialize(io)
end
vertex_stats_df = open(ZstdDecompressorStream, joinpath(scratch_path, "$(proj_info.id)_hotnet_vertex_stats_$(proj_info.hotnet_ver).jlser.zst"), "r") do io
    deserialize(io)
end
#@load(joinpath(scratch_path, "$(proj_info.id)_hotnet_vertex_stats_$(proj_info.hotnet_ver).jld2"),
#      vertex_stats_df)
# group 1 = non-permuted trees (one per bait), group i+1 = permuted trees of bait_ids[i]
ntrees_pergroup = size.(getindex.(Ref(bait2vertex_weights_perms), bait_ids), 2)
pushfirst!(ntrees_pergroup, length(bait_ids)) # non-permuted trees
nchunks_pergroup = fld1.(ntrees_pergroup, job_info.ntrees_perchunk)
group2lastchunk = cumsum(nchunks_pergroup)
# locate the group that job_info.chunk belongs to and its position within that group
chunkgroup = searchsortedfirst(group2lastchunk, job_info.chunk)
@assert (chunkgroup <= length(group2lastchunk)) "Chunk #$(job_info.chunk) is outside of the valid chunks range"
localchunk = job_info.chunk - (chunkgroup > 1 ? group2lastchunk[chunkgroup-1] : 0)
# indices of chunk trees to process (within bait_id perm_trees vector)
chunk_trees = ((localchunk-1)*job_info.ntrees_perchunk + 1):min(
        localchunk*job_info.ntrees_perchunk,
        ntrees_pergroup[chunkgroup])
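# Worked example of the chunk-to-trees mapping (hypothetical sizes: 3 baits with
# 100 weight permutations each, ntrees_perchunk = 50):
#   ntrees_pergroup  = [3, 100, 100, 100]  (group 1 = non-permuted trees)
#   nchunks_pergroup = [1, 2, 2, 2]
#   group2lastchunk  = [1, 3, 5, 7]
#   chunk #1 => chunkgroup = 1 (non-permuted), localchunk = 1, chunk_trees = 1:3
#   chunk #2 => chunkgroup = 2 (bait_ids[1]),  localchunk = 1, chunk_trees = 1:50
#   chunk #3 => chunkgroup = 2 (bait_ids[1]),  localchunk = 2, chunk_trees = 51:100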
treestats_dfs = sizehint!(Vector{DataFrame}(), length(chunk_trees))
W = eltype(reactomefi_walkmtx)
tree_mtx = similar(reactomefi_walkmtx) # preallocated buffer for the vertex weight-scaled walk matrix
# Add the sink/flow-related columns filled with missing values; used when a bait
# has no APMS (sink) vertices, so that all per-tree stats data frames have the
# same columns and can be vcat-ed into a single chunk data frame.
function append_missing_slowstats!(statsdf)
    statsdf[!, :topn_nsinks] = missings(Int, nrow(statsdf))
    statsdf[!, :ncompsources] = missings(Int, nrow(statsdf))
    statsdf[!, :ncompsinks] = missings(Int, nrow(statsdf))
    statsdf[!, :nflows] = missings(Int, nrow(statsdf))
    statsdf[!, :ncompflows] = missings(Int, nrow(statsdf))
    statsdf[!, :flow_avglen] = missings(Float64, nrow(statsdf))
    statsdf[!, :compflow_avglen] = missings(Float64, nrow(statsdf))
    statsdf[!, :flow_distance] = missings(Float64, nrow(statsdf))
    statsdf[!, :compflow_distance] = missings(Float64, nrow(statsdf))
    return statsdf
end
if chunkgroup > 1
    # this chunk covers permuted trees of a single bait
    bait_id = bait_ids[chunkgroup-1]
    apms_vertices = get(bait2apms_vertices, bait_id, nothing)
    isnothing(apms_vertices) && @warn("No APMS vertices for bait $bait_id")
    vertex_weights_perms = bait2vertex_weights_perms[bait_id]
    empty!(bait2vertex_weights_perms) # release unneeded
    permtrees = bait2reactomefi_perm_trees[bait_id]
    empty!(bait2reactomefi_perm_trees) # release unneeded
    vertex_bait_stats_df = filter(r -> r.bait_full_id == bait_id, vertex_stats_df)
    @info "Processing $bait_id permuted trees $(chunk_trees)"
    for i in chunk_trees
        @info "Treecut statistics for $bait_id permuted tree #$i"
        vertex_weights = convert(Vector{W}, view(vertex_weights_perms, :, i))
        treestats_df = HHN.treecut_stats(permtrees[i], vertex_bait_stats_df,
                mannwhitney_tests=true,
                walkmatrix=mul!(tree_mtx, reactomefi_walkmtx, Diagonal(vertex_weights)),
                sources=findall(>(0), vertex_weights),
                sinks=apms_vertices,
                nflows_ratio=0.99,
                pools=nothing)
        isnothing(apms_vertices) && append_missing_slowstats!(treestats_df)
        treestats_df[!, :tree] .= i
        treestats_df[!, :bait_full_id] .= bait_id
        treestats_df[!, :is_permuted] .= true
        push!(treestats_dfs, treestats_df)
    end
else
    # the first chunk group covers the non-permuted trees, one per bait
    @info "Processing non-permuted trees"
    empty!(bait2vertex_weights_perms) # release unneeded
    empty!(bait2reactomefi_perm_trees) # release unneeded
    for i in chunk_trees
        bait_id = bait_ids[i]
        @info "Treecut statistics for non-permuted tree of $bait_id"
        apms_vertices = get(bait2apms_vertices, bait_id, nothing)
        vertex_weights = convert(Vector{W}, bait2vertex_weights[bait_id])
        treestats_df = HHN.treecut_stats(bait2reactomefi_tree[bait_id],
                filter(r -> r.bait_full_id == bait_id, vertex_stats_df),
                mannwhitney_tests=true,
                walkmatrix=mul!(tree_mtx, reactomefi_walkmtx, Diagonal(vertex_weights)),
                sources=findall(>(0), vertex_weights),
                sinks=apms_vertices,
                nflows_ratio=0.999,
                pools=nothing)
        isnothing(apms_vertices) && append_missing_slowstats!(treestats_df)
        treestats_df[!, :tree] .= 0 # tree=0 marks the non-permuted tree
        treestats_df[!, :bait_full_id] .= bait_id
        treestats_df[!, :is_permuted] .= false
        push!(treestats_dfs, treestats_df)
    end
    bait_id = nothing # the non-permuted chunk spans multiple baits
end
chunkstats_df = reduce(vcat, treestats_dfs)
chunk_prefix = "$(proj_info.id)_hotnet_perm_treecut_stats_$(proj_info.hotnet_ver)"
isdir(joinpath(scratch_path, chunk_prefix)) || mkpath(joinpath(scratch_path, chunk_prefix)) # ensure the per-chunk output folder exists
open(ZstdCompressorStream, joinpath(scratch_path, chunk_prefix, "$(chunk_prefix)_$(job_info.chunk).jlser.zst"), "w") do io
    serialize(io, (job_info, bait_id, chunk_trees, chunkstats_df))
end
@info "Permuted trees treecut statistics chunk #$(job_info.chunk) complete"