#!/usr/bin/env python
'''
Usage:
    THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 \
        python gen_next_level_data.py <parent_go_id> <go_id> [<level>]
'''
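# Purpose (a short summary inferred from main() below): load the classifier
# trained for <go_id> at the current level, predict on that term's data frame,
# keep the proteins predicted positive together with their GO annotation sets
# from train.txt, and save the filtered data frame plus a copy of the trained
# model as input for the next level of the hierarchy.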
import numpy
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import (
    Dense, Dropout, Activation, Flatten, Merge, Highway)
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from sklearn.metrics import classification_report
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint, EarlyStopping
from utils import (
    train_test_split, normalize_aa,
    shuffle, encode_seq_one_hot, encode_seq, encode_seq_hydro,
    get_gene_ontology
)
import os
import sys
import pdb
from keras.optimizers import Adam
import shutil
from collections import deque
import pandas as pd

LAMBDA = 24
DATA_ROOT = 'data/fofe/'
CUR_LEVEL = 'level_1/'
NEXT_LEVEL = 'level_2/'
go = get_gene_ontology()
go_model = dict()
MAXLEN = 500


def get_gos_by_prot_id():
    # Map each protein's position (line number) in train.txt to the set of
    # GO identifiers listed in its third tab-separated column.
    data = dict()
    with open(DATA_ROOT + 'train.txt', 'r') as f:
        prot_id = 0
        for line in f:
            line = line.strip().split('\t')
            gos = line[2].split('; ')
            go_set = set()
            for go_id in gos:
                go_set.add(go_id)
            data[prot_id] = go_set
            prot_id += 1
    return data


def load_data(parent_id, go_id):
    # Load the current-level data frame for this GO term and drop the leading
    # rows before the first positive example (label == 1).
    df = pd.read_pickle(
        DATA_ROOT + CUR_LEVEL + parent_id + '/' + go_id + '.pkl')
    n = 0
    for l in df['labels']:
        if l == 1:
            break
        n += 1
    df = df[n:]
    df = df.reindex()
    return df


def get_model(go_id, parent_id):
    # Rebuild the classifier architecture used at the current level and load
    # its trained weights for the given GO term.
    filepath = DATA_ROOT + CUR_LEVEL + parent_id + '/' + go_id + '.hdf5'
    model = Sequential()
    model.add(Dense(8000, activation='relu', input_dim=8000))
    model.add(Highway())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        loss='binary_crossentropy', optimizer='rmsprop', class_mode='binary')
    # Loading saved weights
    print 'Loading weights for ' + go_id
    model.load_weights(filepath)
    return model


def main(*args, **kwargs):
    if len(args) < 3:
        raise Exception('Please provide parent GO id and GO (function) id')
    parent_id = args[1]
    go_id = args[2]
    if len(args) == 4:
        level = int(args[3])
        global CUR_LEVEL
        global NEXT_LEVEL
        CUR_LEVEL = 'level_' + str(level) + '/'
        NEXT_LEVEL = 'level_' + str(level + 1) + '/'
    df = load_data(parent_id, go_id)
    go_sets = get_gos_by_prot_id()
    model = get_model(go_id, parent_id)
    data = df['data'].as_matrix()
    data = numpy.hstack(data).reshape(data.shape[0], 8000)
    pred = model.predict_classes(
        data,
        batch_size=16,
        verbose=1)
    # Keep only the examples predicted positive and attach the full set of
    # GO annotations for each of their proteins.
    gos = list()
    index = list()
    for i in range(len(data)):
        if pred[i] == 1:
            index.append(df.index[i])
            gos.append(list(go_sets[df['proteins'][df.index[i]]]))
    df = df.reindex(index)
    df['gos'] = pd.Series(gos, index=df.index)
    # Store the selected data and a copy of the trained model for the next level.
    dirpath = DATA_ROOT + NEXT_LEVEL + 'data/'
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    df.to_pickle(DATA_ROOT + NEXT_LEVEL + 'data/' + go_id + '.pkl')
    model_path = DATA_ROOT + CUR_LEVEL + parent_id + '/' + go_id + '.hdf5'
    dst_model_path = DATA_ROOT + NEXT_LEVEL + 'data/' + go_id + '.hdf5'
    shutil.copyfile(model_path, dst_model_path)


if __name__ == '__main__':
    main(*sys.argv)