-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathweb_edge_page_data.py
312 lines (271 loc) · 13.3 KB
/
web_edge_page_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
"""Should be run *after* `web_initialize_db.py`.
For every edge, create a page reporting on its underlying genes.
This script computes and inserts the page information into a MongoDB
collection called `pathcore_edge_data`.
Defaults are the ones used for the PAO1 KEGG eADAGE network demo server.
Usage:
web_edge_page_data.py <network> <db-credentials>
[--features=<n-features>]
[--samples=<keep-n-samples>] [--genes=<keep-n-genes>]
web_edge_page_data.py -h | --help
Options:
-h --help Show this screen.
<network> Path to the network file that will be
visualized on the web application. Expects
that this is the network output from the
permutation test in the full PathCORE-T analysis.
<db-credentials> Path to the .yml file containing the necessary
DB credentials to access an mLab-served MongoDB
database.
--features=<n-features> Number of features constructed in each model.
[default: 300]
--samples=<keep-n-samples> The number of most/least expressed samples
(each) shown on an edge page.
[default: 20]
--genes=<keep-n-genes> The number of genes shown on an edge page.
[default: 20]
"""
import sys
from docopt import docopt
from pymongo import ASCENDING # for indexing the Collection
from pymongo import MongoClient
import pandas as pd
import yaml
def index_sort(list_to_sort):
"""
Parameters
-----------
list_to_sort : list(Any)
List of elements, any type
Returns
-----------
list(tup(int, Any)), a list of tuples, where
tup[0] is the position of the element in the original list and
tup[1] is the element itself.
The tuples in the list are ordered in descending order
based on tup[1] element comparisons.
"""
return [index_element_tup for index_element_tup
in sorted(enumerate(list_to_sort), key=lambda x: -x[1])]
class EdgeInfo:
"""Functions in this class are used to generate the JSON object
corresponding to each edge page"""
def __init__(self, db, n_features, keep_n_samples=20, keep_n_genes=20):
"""
Parameters
-----------
db : pymongo.database.Database
The database retrieved using MongoClient
n_features : int
The number of features from which each network was constructed
(# features constructed in a model)
keep_n_samples : int (default=20)
The number of samples to display in each heatmap. Limited so
the heatmap fits on a web page and can be manually reviewed.
keep_n_genes : int (default=20)
The number of genes to display in each heatmap. Limited so
the heatmap fits on a web page and can be manually reviewed.
"""
self.db = db
self.n_features = n_features
# the final network may be an aggregate of multiple networks
self.n_networks = len(self.db.network_edges.distinct("network"))
self.n_total_features = self.n_features * self.n_networks
self.keep_n_samples = keep_n_samples
self.keep_n_genes = keep_n_genes
def get_edge_info(self, pw0_name, pw1_name):
"""
Parameters
-----------
pw0_name : str
Pathway 0 in the edge
pw1_name : str
Pathway 1 in the edge
Returns
-----------
dict(), the dict that will be inserted into the MongoDB collection
`pathcore_edge_data` as a JSON object. Contains the edge page
information that will be parsed and displayed on the web application.
"""
pw0_info = self.db.pathways.find_one({"pathway": pw0_name})
pw1_info = self.db.pathways.find_one({"pathway": pw1_name})
pw0_id = pw0_info["_id"]
pw1_id = pw1_info["_id"]
pw0_annotations = set(pw0_info["annotations"])
pw1_annotations = set(pw1_info["annotations"])
genes_annotated_to_edge = pw0_annotations | pw1_annotations
# specify the pathway(s) to which a gene is annotated
genes_which_pathway = {}
for gene in genes_annotated_to_edge:
if gene in pw0_annotations and gene not in pw1_annotations:
genes_which_pathway[gene] = 0
elif gene in pw1_annotations and gene not in pw0_annotations:
genes_which_pathway[gene] = 1
else: # annotated to both
genes_which_pathway[gene] = 2
# track how many features contain both this gene and this edge
count_gene_edge_common = {}
for gene in genes_annotated_to_edge:
count_gene_edge_common[gene] = 0
edge_in_networks = self.db.network_edges.find(
{"edge": [pw0_id, pw1_id]})
n_features_contain_edge = 0
for edge in edge_in_networks:
network = edge["network"]
features = edge["features"]
n_features_contain_edge += len(features)
# look at the gene signatures of all features that contain
# this edge
all_feature_infos = self.db.network_feature_signatures.find(
{"network": network, "feature": {"$in": features}})
for feature_info in all_feature_infos:
signature = set()
if "positive_signature" in feature_info:
signature |= set(feature_info["positive_signature"])
if "negative_signature" in feature_info:
signature |= set(feature_info["negative_signature"])
for gene in genes_annotated_to_edge & signature:
count_gene_edge_common[gene] += 1
odds_ratio_information = self._edge_pathway_annotation_odds_ratios(
genes_which_pathway, n_features_contain_edge,
count_gene_edge_common)
return self._edge_page_data(odds_ratio_information)
def _edge_pathway_annotation_odds_ratios(self,
which_pathway,
n_features_contain_edge,
count_gene_edge_common):
"""Helper function computes the odds ratio of every gene annotated to
the edge (that is, annotated to one or both of the pathways that make
up the edge), and returns the odds ratio information for up to
`self.keep_n_genes` genes with an odds ratio above 1.
"""
odds_ratios = {}
odds_ratio_denominator = (n_features_contain_edge /
float(self.n_total_features))
for gene_id, count in count_gene_edge_common.items():
# number of times this gene appears in any feature's gene signature
# (not limited to the features that contain the edge)
n_features_contain_gene = self.db.network_feature_signatures.find(
{"$or": [{"positive_signature": gene_id},
{"negative_signature": gene_id}]}).count()
# edge case: divide by 0 occurs if the annotated gene (from the
# pathway definitions file) was not in the transcriptomic dataset
if not count and not n_features_contain_gene:
continue
# `count` (number of times the edge AND gene appear) is normalized
# by the number of times the gene appears in any feature
odds_ratio_numerator = count / float(n_features_contain_gene)
odds_ratios[gene_id] = (odds_ratio_numerator /
odds_ratio_denominator)
odds_ratios_above_one = []
for gene_id, odds_ratio in odds_ratios.items():
if odds_ratio > 1.:
gene = self.db.genes.find_one({"_id": gene_id})
gene_name = gene["gene"]
if "common_name" in gene:
gene_name = gene["common_name"]
odds_ratios_above_one.append(
(str(gene_name), odds_ratio, which_pathway[gene_id]))
# descending by odds ratio
odds_ratios_above_one.sort(key=lambda x: -x[1])
limit_genes_returned = min(
len(odds_ratios_above_one), self.keep_n_genes)
return odds_ratios_above_one[:limit_genes_returned]
def _edge_page_data(self, gene_odds_ratio_list):
"""Helper function creates the dict (MongoDB JSON object) that contains
all information needed for the PathCORE-T web application edge page.
"""
if not gene_odds_ratio_list: # no genes to display for this edge
return {"flag": 1}
norm_sample_matrix = []
sum_odds_ratios = 0.0
# `norm_sample_matrix` shape = [n_genes, n_samples]
for gene_name, odds_ratio, belongs_to in gene_odds_ratio_list:
norm_sample_matrix.append(
self.db.genes.find_one(
{"$or": [{"gene": gene_name},
{"common_name": gene_name}]})["expression"])
sum_odds_ratios += odds_ratio
# transpose => shape [n_samples, n_genes]
norm_sample_matrix = [list(l) for l in zip(*norm_sample_matrix)]
# for each sample, compute a summary "sample score"
sample_scores = []
for sample_data in norm_sample_matrix:
score = 0.
for index, gene_expression in enumerate(sample_data):
_, gene_odds_ratio, _ = gene_odds_ratio_list[index]
weight = gene_odds_ratio / sum_odds_ratios
score += weight * gene_expression
sample_scores.append(score)
index_sorted_scores = index_sort(sample_scores)
most_expressed_heatmap, most_expressed_samples = \
self._get_heatmap_data(
index_sorted_scores[:self.keep_n_samples], norm_sample_matrix)
least_expressed_heatmap, least_expressed_samples = \
self._get_heatmap_data(
index_sorted_scores[-self.keep_n_samples:], norm_sample_matrix)
# unzips a list of 3-element tuples
gene_names, odds_ratios, pw_owner = [list(l) for l in
zip(*gene_odds_ratio_list)]
return {"most_expressed_heatmap": most_expressed_heatmap,
"least_expressed_heatmap": least_expressed_heatmap,
"most_expressed_samples": most_expressed_samples,
"least_expressed_samples": least_expressed_samples,
"gene_names": gene_names,
"odds_ratios": odds_ratios,
"pathway_owner": pw_owner}
def _get_heatmap_data(self, index_sample_scores, expression_matrix):
"""Helper function returns 2 lists: one contains the
positions & expression values of every square in the heatmap and
the other contains the ordering of the samples (columns) of the heatmap
"""
heatmap_data = []
ordered_sample_labels = []
for rank, (index, score) in enumerate(index_sample_scores):
sample_label = self.db.sample_labels.find_one(
{"_id": index})["sample"]
ordered_sample_labels.append(str(sample_label))
for gene_index, gene_expression \
in enumerate(expression_matrix[index]):
heatmap_square = {"col_index": rank,
"row_index": gene_index,
"value": gene_expression}
heatmap_data.append(heatmap_square)
return (heatmap_data, ordered_sample_labels)
if __name__ == "__main__":
arguments = docopt(
__doc__, version="web application setup, 1.0")
network_file = arguments["<network>"]
db_credentials_file = arguments["<db-credentials>"]
n_features = int(arguments["--features"])
keep_n_samples = int(arguments["--samples"])
keep_n_genes = int(arguments["--genes"])
credentials_info_dict = {}
with open(db_credentials_file, "r") as credentials_info:
try:
credentials_info_dict = yaml.load(credentials_info)
except yaml.YAMLError as exception:
sys.exit(exception)
mongo_client = MongoClient("mongodb://{0}:{1}@{2}/{3}".format(
credentials_info_dict["user"], credentials_info_dict["password"],
credentials_info_dict["mLab_URI"], credentials_info_dict["db_name"]))
db = mongo_client[credentials_info_dict["db_name"]]
db.pathcore_edge_data.drop()
network = pd.read_table(network_file)
compute_edge_info = EdgeInfo(db,
n_features,
keep_n_samples=keep_n_samples,
keep_n_genes=keep_n_genes)
print("Started PathCORE-T edge data insertion operations")
edge_info_list = []
for index, row in network.iterrows():
edge_info = compute_edge_info.get_edge_info(
row["pw0"], row["pw1"])
edge_info["edge"] = (row["pw0"], row["pw1"])
edge_info["weight_odds_ratio"] = row["weight"]
edge_info_list.append(edge_info)
db.pathcore_edge_data.insert_many(edge_info_list)
db.pathcore_edge_data.create_index(
[("edge", ASCENDING),
("weight_odds_ratio", ASCENDING)])
print("Completed PathCORE-T edge data insertion operations")