-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_data.py
73 lines (58 loc) · 1.65 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
import sys
import numpy as np
import pandas as pd
from keras.utils import np_utils
from utils import get_gene_ontology
from collections import deque
# Directory that holds the input text file and receives the generated pickle.
DATA_ROOT = 'data/fofe/'
# Name of the tab-separated input file read by load_data().
FILENAME = 'train.txt'
# Ontology loaded once at import time from 'go.obo'. The functions below index
# it by GO id and read each entry's 'children' and 'is_a' collections.
go = get_gene_ontology('go.obo')
def get_go_set(go_id, ontology=None):
    """Return ``go_id`` together with every term reachable through 'children'.

    Performs a breadth-first walk down the ontology starting at ``go_id``.

    Args:
        go_id: Identifier of the term to start from; must be a key of the
            ontology.
        ontology: Optional mapping ``id -> {'children': iterable of ids, ...}``.
            Defaults to the module-level ``go`` ontology.

    Returns:
        A set containing ``go_id`` and all of its descendants.

    Raises:
        KeyError: If a visited id is not present in the ontology.
    """
    if ontology is None:
        ontology = go
    go_set = {go_id}
    q = deque([go_id])
    while q:
        g_id = q.popleft()
        for ch_id in ontology[g_id]['children']:
            # Mark terms when they are enqueued so that a term reachable via
            # several parents is expanded only once (the ontology is a DAG,
            # not a tree; without this the walk revisits shared sub-graphs
            # once per path).
            if ch_id not in go_set:
                go_set.add(ch_id)
                q.append(ch_id)
    return go_set
# Every GO id reachable from 'GO:0003674' via 'children' links, the start id
# included. load_data() keeps only annotations that belong to this set.
# NOTE(review): 'GO:0003674' is presumably the molecular_function root term;
# confirm against go.obo.
functions = get_go_set('GO:0003674')
def get_anchestors(go_id, ontology=None):
    """Return ``go_id`` together with all of its ancestors via 'is_a' links.

    Performs a breadth-first walk up the ontology starting at ``go_id``.
    Parent ids that are not present in the ontology are ignored. (The
    misspelled function name is kept for backward compatibility.)

    Args:
        go_id: Identifier of the term to start from; must be a key of the
            ontology.
        ontology: Optional mapping ``id -> {'is_a': iterable of ids, ...}``.
            Defaults to the module-level ``go`` ontology.

    Returns:
        A set containing ``go_id`` and every known ancestor.

    Raises:
        KeyError: If ``go_id`` itself is not present in the ontology.
    """
    if ontology is None:
        ontology = go
    go_set = {go_id}
    q = deque([go_id])
    while q:
        g_id = q.popleft()
        for parent_id in ontology[g_id]['is_a']:
            # Skip parents missing from the ontology, and skip parents that
            # were already scheduled: a term with several paths to the root
            # would otherwise be expanded once per path.
            if parent_id in ontology and parent_id not in go_set:
                go_set.add(parent_id)
                q.append(parent_id)
    return go_set
def load_data():
    """Read the training file and expand each protein's GO annotations.

    The file at ``DATA_ROOT + FILENAME`` is read line by line. Each non-blank
    line is split on tabs into: protein id, sequence, and a '; '-separated
    list of GO ids. Only GO ids contained in ``functions`` are kept; each one
    is expanded with ``get_anchestors`` and the root 'GO:0003674' is dropped
    from the result.

    Returns:
        A tuple ``(proteins, sequences, gos)`` of three parallel lists, where
        ``gos[i]`` is a list (in arbitrary order) of GO ids for
        ``proteins[i]``. The list is empty when the record has no annotation
        in ``functions``.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    proteins = []
    sequences = []
    gos = []
    with open(DATA_ROOT + FILENAME, 'r') as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            items = line.split('\t')
            proteins.append(items[0])
            sequences.append(items[1])
            go_set = set()
            for go_id in items[2].split('; '):
                if go_id in functions:
                    go_set |= get_anchestors(go_id)
            # discard() rather than remove(): when none of the annotations is
            # in ``functions`` the set is empty and remove() would raise
            # KeyError.
            go_set.discard('GO:0003674')
            gos.append(list(go_set))
    return proteins, sequences, gos
def main(*args, **kwargs):
    """Build the training DataFrame and pickle it to ``DATA_ROOT + 'train.pkl'``.

    The frame has three columns: 'proteins', 'sequences' and 'gos', where each
    'gos' cell holds the list of GO ids produced by ``load_data`` for that
    protein.

    Args:
        *args: Ignored; present because the script passes ``sys.argv`` through.
        **kwargs: Ignored.
    """
    proteins, sequences, gos = load_data()
    # Hand the plain Python lists to pandas directly. Wrapping ``gos`` in
    # np.array() fails for ragged per-protein lists on recent NumPy releases
    # (ValueError: inhomogeneous shape) and produces a 2-D array that
    # DataFrame rejects when all lists happen to have equal length.
    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'gos': gos,
    })
    df.to_pickle(DATA_ROOT + 'train.pkl')


if __name__ == '__main__':
    main(*sys.argv)