-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_permutation_test.py
165 lines (135 loc) · 6.4 KB
/
run_permutation_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""
Description:
This script iterates through a directory of network files
generated from the `run_network_creation.py` script and applies a
permutation test to the networks.
If there is more than 1 network file in the directory,
the networks are combined to make a single aggregate network.
Edges that are significant under their corresponding nulls
(generated by the permutation test) are kept in the resulting
final (aggregate, if applicable) network.
Currently, defaults are set up for the eADAGE analysis provided in
the PathCORE-T paper.
Output:
- <significant-edges-file> [default: significant_edges.tsv]
significant edges with the following columns:
pw0, pw1, p-value, q-value (FDR-corrected p-value), odds ratio
where pw0 (pathway 0) and pw1 (pathway 1) are the two pathways
that share an edge.
- <filtered-network-file> [default: filtered_network.tsv]
Network file that only contains the edges
specified in `significant_edges.tsv`.
Edge weight is updated to be an odds ratio:
observed / average expected weight
If this is an aggregate network, the feature column
should be empty.
Both file names can be changed through command-line arguments
(see documentation)
Usage:
run_permutation_test.py
<significant-pws-dir> <output-dir>
[--n-permutations=<n-permutations>]
[--n-features=<n-features>] [--n-cores=<n-cores>]
[--alpha=<alpha>]
[--edges-out=<edges-file>]
[--network-out=<network-file>]
[--shorten=<pathway-shorten>]
[--random-seed=<random-seed>]
run_permutation_test.py -h | --help
Options:
-h --help Show this screen
<significant-pws-dir> Path to the directory containing the
significant pathways files generated by
running `run_network_creation.py`
<output-dir> Path to the directory that will store
the output files. Will be automatically
created if the directory does not
currently exist
--n-permutations=<n-permutations> Number of permutations generated for a
network
[default: 10000]
--n-features=<n-features> Number of constructed features in each
model
[default: 300]
--n-cores=<n-cores> Number of cores used to run the analysis
on models in parallel
[default: num. available cores - 1]
--alpha=<alpha> Significance level for pathway
enrichment. Overrepresentation is
determined by a Fisher's exact test
with false-discovery rate correction
Benjamini-Hochberg (1995, 2000)
[default: 0.05]
--edges-out=<edges-file> Filename to store the edges that are
significant after the permutation test
[default: significant_edges.tsv]
--network-out=<network-file> Filename to store the (aggregate)
network containing only the significant
edges
[default: filtered_network.tsv]
--random-seed=<random-seed> Set the random state for the
permutation test
[default: 640]
"""
from copy import deepcopy
import multiprocessing
import os
import random
from time import time
from docopt import docopt
from joblib import Parallel, delayed
from pathcore import CoNetwork
from pathcore import aggregate_permuted_network, \
network_edges_permutation_test
if __name__ == "__main__":
    # The module docstring doubles as the docopt command-line specification.
    arguments = docopt(
        __doc__, version="run permutation test 1.0")
    significant_pathways_directory = arguments["<significant-pws-dir>"]
    output_directory = arguments["<output-dir>"]
    n_permutations = int(arguments["--n-permutations"])
    n_features = int(arguments["--n-features"])
    alpha = float(arguments["--alpha"])
    edges_filename = arguments["--edges-out"]
    final_network_filename = arguments["--network-out"]
    random_seed = int(arguments["--random-seed"])

    # NOTE(review): this seeds only the `random` module of the current
    # process. Whether the joblib workers that run
    # `aggregate_permuted_network` observe this seed depends on the joblib
    # backend and on which RNG pathcore uses -- confirm before relying on
    # `--random-seed` for reproducible results.
    random.seed(random_seed)

    os.makedirs(output_directory, exist_ok=True)

    # The documented default for --n-cores is a non-numeric placeholder
    # string, so anything that is not a plain integer means
    # "all available cores but one".
    requested_cores = arguments["--n-cores"]
    if requested_cores.isdigit():
        n_cores = int(requested_cores)
    else:
        n_cores = multiprocessing.cpu_count() - 1
    # joblib rejects n_jobs=0, which would otherwise happen on a
    # single-core machine or when `--n-cores=0` is passed.
    n_cores = max(1, n_cores)

    t_o = time()

    # Build one CoNetwork per file in the input directory and fold them
    # into a single aggregate observed network.
    aggregate_observed_network = None
    individual_networks = []
    # os.listdir returns entries in arbitrary order; sort so the order of
    # `individual_networks` is stable across runs and platforms.
    for filename in sorted(os.listdir(significant_pathways_directory)):
        path_to_significant_pathways_file = os.path.join(
            significant_pathways_directory, filename)
        if os.path.isdir(path_to_significant_pathways_file):
            continue
        network_object = CoNetwork(
            n_features,
            significant_pathways=path_to_significant_pathways_file)
        individual_networks.append(network_object)
        # Compare against None explicitly so the check does not depend on
        # how (or whether) CoNetwork defines truthiness.
        if aggregate_observed_network is None:
            # Copy so that aggregating later networks does not mutate the
            # first entry of `individual_networks`.
            aggregate_observed_network = deepcopy(network_object)
        else:
            aggregate_observed_network.aggregate(network_object)

    if aggregate_observed_network is None:
        # Fail early with a clear message instead of running every
        # permutation on an empty list and handing None to the test.
        raise SystemExit(
            "No significant pathways files found in directory "
            "'{0}'.".format(significant_pathways_directory))

    # Generate the null distribution: one permuted aggregate network per
    # permutation, spread over `n_cores` workers.
    with Parallel(n_jobs=n_cores) as parallel:
        permutations = parallel(
            delayed(aggregate_permuted_network)(individual_networks)
            for _ in range(n_permutations))

    t_f = time() - t_o
    n_networks = len(individual_networks)
    print("{0} permutations of {1} network(s) took "
          "{2} seconds to run on {3} core(s).".format(
              n_permutations, n_networks, t_f, n_cores))

    # this function returns a CoNetwork object. because the network is
    # also written to `arguments["--network-out"]`, we ignore the
    # return value in this context.
    network_edges_permutation_test(
        aggregate_observed_network, permutations, alpha,
        n_networks=n_networks,
        output_edges_to_file=os.path.join(
            output_directory, edges_filename),
        output_network_to_file=os.path.join(
            output_directory, final_network_filename))