-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeg2table.py
197 lines (167 loc) · 6.01 KB
/
keg2table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#Exports .keg BRITE hierarchy (htext) into a parsable table format
import re
import sqlite3
import tablib
from argparse import ArgumentParser
# Matches a dotted four-part EC number such as "2.1.1.43". Only a single digit
# of the last component is consumed, so a search in "2.1.1.43" matches "2.1.1.4".
# NOTE(review): not referenced anywhere in the visible code.
ec_pattern = re.compile(r"(\d+\.){3}\d")
"""KEGG htext stores level of hierarchy as a capital letter, starting at A
This returns a numerical value corresponding to the level of hierarchy
"""
def hlevel(htext_line):
    """Return the zero-based hierarchy level encoded by the first character.

    "A" maps to 0, "B" to 1 and so on. Characters that sort before "A"
    (for example "#", "!", "+" or a newline) give a negative number, which
    callers treat as "not a hierarchy line".

    An empty string has no level marker at all, so it yields -1 instead of
    raising TypeError from ord("").
    """
    if not htext_line:
        return -1
    return ord(htext_line[0]) - ord("A")
def hchar(level):
    """Return the capital letter for a zero-based hierarchy level (0 -> "A")."""
    first_letter = ord("A")
    return chr(first_letter + level)
#It is expected that opening and closing sequences (a and b) are different
def between(s, a, b):
    """Return the text between the first `a` and the next `b` that follows it.

    s -- string to search
    a -- opening sequence
    b -- closing sequence

    The closing sequence is searched for only after the end of the opening
    one, so an earlier stray `b` (or `a == b`) does not produce an empty or
    reversed slice. If either sequence cannot be found the result is "";
    str.find's -1 is never used as a slice index.
    """
    start = s.find(a)
    if start == -1:
        return ""
    start += len(a)
    end = s.find(b, start)
    if end == -1:
        return ""
    return s[start:end]
#It is expected that opening and closing sequences (a and b) are different
def rbetween(s, a, b):
    """Return the text between the last `b` and the closest `a` before it.

    s -- string to search
    a -- opening sequence
    b -- closing sequence

    The opening sequence is searched for only in the part of `s` that
    precedes the last closing sequence, so a trailing unmatched `a` (or
    `a == b`) does not produce an empty or reversed slice. If either
    sequence cannot be found the result is ""; str.rfind's -1 is never used
    as a slice index.
    """
    end = s.rfind(b)
    if end == -1:
        return ""
    start = s.rfind(a, 0, end)
    if start == -1:
        return ""
    return s[start + len(a):end]
#Strip the outermost pair of tags
def strip_tags(string):
    """Return the text enclosed by the first tag pair found in `string`.

    The tag is whatever sits between the first "<" and the next ">"; the
    result is the text between that opening tag and its "</tag>" closing
    tag, e.g. "<b>1. Oxidoreductases</b>" -> "1. Oxidoreductases".

    The string is returned unchanged when it has no well-formed pair: no
    "<", no ">" after it, or no matching closing tag. This keeps text such
    as "<b>text" or "a > b < c" intact instead of slicing characters off.
    """
    tag_open = string.find("<")
    tag_close = string.find(">", tag_open + 1)
    if tag_open == -1 or tag_close == -1:
        return string
    tag = string[tag_open + 1:tag_close]
    inner_start = tag_close + 1
    # Look for the closing tag only after the opening tag has ended.
    inner_end = string.find("</{}>".format(tag), inner_start)
    if inner_end == -1:
        return string
    return string[inner_start:inner_end]
"""Line contains an ID, rest of line is a description
"""
def parse_id_line (content):
    """Split a line into its leading ID and the remaining description.

    Returns (id, description); runs of whitespace inside the description
    are collapsed to single spaces. Raises IndexError when `content` holds
    no tokens at all.
    """
    pieces = content.split(None, 1)
    identifier = pieces[0]
    if len(pieces) == 1:
        return identifier, ""
    return identifier, " ".join(pieces[1].split())
"""Line contains an ID, an optional comma separated list ended with a semicolon and a description
"""
def parse_description_line (content):
    """Split a line into ID, alternative names and description.

    The ID is the first token. It may be followed by a comma separated list
    of alternative names whose last item ends with a semicolon; everything
    after that list is the description.

    Returns (name, alternative_names, description), where the alternative
    names are joined with single spaces. When the tokens after the ID do not
    form a list that is closed by a semicolon, they are all part of the
    description and alternative_names is "". Without this check a line such
    as "17355 family, member 1" would report "family" both as an alternative
    name and as part of the description.

    Raises IndexError when `content` holds no tokens at all.
    """
    tokens = content.split()
    name = tokens[0]
    candidates = []
    terminated = False
    for token in tokens[1:]:
        if token.endswith(";"):
            candidates.append(token[:-1])
            terminated = True
            break
        if not token.endswith(","):
            break
        candidates.append(token[:-1])
    if not terminated:
        # No closing semicolon: what looked like list items is description text.
        candidates = []
    alternative_names = " ".join(candidates)
    description = " ".join(tokens[1 + len(candidates):])
    return name, alternative_names, description
"""Line contains an EC and a description
"""
def parse_ec_line (content):
    """Split a line into its EC number and description.

    The line is first passed through strip_tags(); the first remaining token
    is the EC number and the other tokens, joined with single spaces, form
    the description. Returns (ec, description). Raises IndexError when no
    tokens remain.
    """
    pieces = strip_tags(content).split(None, 1)
    ec = pieces[0]
    if len(pieces) == 1:
        return ec, ""
    return ec, " ".join(pieces[1].split())
"""Line contains an ID, description and a list with accessor in square brackets i.e. [ACC: 1 2 3]
"""
def parse_kegg_line (content):
    """Split a line into ID, description, list accessor and list items.

    The expected shape is "<ID> <description> [<ACC>:<items>]", for example
    "K00567 methylated-DNA-[protein]-cysteine S-methyltransferase [EC:2.1.1.63]".

    Returns (name, description, list_accessor, list_items). The list items
    are returned exactly as written after the colon. Only a bracketed group
    that ends the line and contains a colon is treated as the list; when
    there is none, the whole remainder is the description and both list
    fields are "". Square brackets inside the description are left alone.

    Raises IndexError when `content` holds no tokens at all.
    """
    tokens = content.split()
    name = tokens[0]
    # Everything that follows the ID, without surrounding whitespace.
    remainder = content[content.find(name) + len(name):].strip()
    description = remainder
    list_accessor = ""
    list_items = ""
    if remainder.endswith("]"):
        opening = remainder.rfind("[")
        if opening != -1:
            accessor, colon, items = remainder[opening + 1:-1].partition(":")
            if colon:
                list_accessor = accessor
                list_items = items
                description = remainder[:opening].strip()
    return name, description, list_accessor, list_items
class Stack:
    """Minimal last-in, first-out container backed by a list."""

    def __init__(self):
        # Items in insertion order; the last element is the top of the stack.
        self.stack = list()

    def push (self, item):
        """Put `item` on top of the stack."""
        self.stack.append(item)

    def pop (self):
        """Remove and return the top item; raises IndexError when empty."""
        # list.pop() removes the item in place instead of copying the list.
        return self.stack.pop()

    def top (self):
        """Return the top item without removing it; raises IndexError when empty."""
        return self.stack[-1]

    def content (self):
        """Return an iterator over the items from the top down."""
        return reversed(self.stack)

    def height (self):
        """Return the number of items on the stack."""
        return len(self.stack)
class KeggParser:
    """Describes the layout of a .keg file as declared by its first line.

    The first line is expected to look like "+<level letter> <column> ...",
    e.g. "+D GENES KO". After construction the instance always carries:

    kegg_map              -- True when exactly two data columns are declared
    enzyme_classification -- True when the first data column is "Enzyme"
    data_level            -- numeric level (see hlevel) of the data lines,
                             -1 when the header does not declare one
    data_columns          -- list of declared data column names

    Every attribute gets a default, so a header that does not start with
    "+", or that declares no columns, yields a usable object instead of
    one with missing attributes.
    """

    # Output columns used for data lines when kegg_map is set.
    map_header = ["Gene name","Common names","Gene description","KO number","KO description","Map","KO mapping"]
    # Output columns used for the category levels when enzyme_classification is set.
    enzyme_header = ["4EC","4EC description","3EC","3EC description","2EC","2EC description","Enzyme type","Enzyme type description"]

    def __init__(self, encoding_line):
        self.kegg_map = False
        self.enzyme_classification = False
        self.data_level = -1
        self.data_columns = []
        if encoding_line.startswith("+"):
            tokens = encoding_line.split()
            level_marker = tokens[0][1:]
            # A bare "+" carries no level letter to decode.
            if level_marker:
                self.data_level = hlevel(level_marker)
            self.data_columns = tokens[1:]
            if self.data_columns and self.data_columns[0] == "Enzyme":
                self.enzyme_classification = True
            if len(self.data_columns) == 2:
                self.kegg_map = True
"""Some examples"""
# Sample input strings; they are not referenced anywhere else in the visible code.
# a: ID, a comma separated list closed by ";", then a description that itself
#    contains "," and ";".
a = "64144 Mllt1, AA407901, BAM11, ENL, LTG19; myeloid/lymphoid or mixed-lineage leukemia (trithorax homolog, Drosophila); translocated to, 1"
# b: ID followed directly by a description; the first token after the ID ends
#    with neither "," nor ";".
b = "17355 AF4/FMR2 family, member 1"
# c: ID, a comma separated list closed by ";", then a description ending with
#    an EC number in parentheses.
c = "214162 Mll1, 6430520K01, ALL-1, All1, Cxxc7, HRX, HTRX1, KIAA4050, KMT2A, Mll, mKIAA4050; myeloid/lymphoid or mixed-lineage leukemia 1 (EC:2.1.1.43)"
# d: ID, a description that contains square brackets, and a trailing "[EC:...]" list.
d = "K00567 methylated-DNA-[protein]-cysteine S-methyltransferase [EC:2.1.1.63]"
if __name__=="__main__":
    # Command-line entry point: read a .keg file, turn every data-level line
    # into one table row (followed by the categories it is nested in) and
    # write the table to output_file in the requested format.
    arg_parser = ArgumentParser(description="Parses a '.keg' BRITE hierarchy file (htext)")
    arg_parser.add_argument("keg_file",type=open,help="Input .keg file")
    arg_parser.add_argument("output_file",type=str,help="Name of the output file or 'stdin'")
    arg_parser.add_argument("-output_format",type=str,default="csv",
        choices=["xls","json","yaml","html","tsv","csv","sqlite"],help="Format of the output table (default .csv)")
    args = arg_parser.parse_args()

    stack = Stack()
    keg = None
    data = None
    is_enzyme = False
    with args.keg_file as keg_file:
        if args.output_format == "sqlite":
            # The table is exported through tablib's format attributes and
            # there is none for sqlite, so refuse instead of writing another format.
            arg_parser.error("sqlite output is not implemented")
        for i, line in enumerate(keg_file):
            #first line is a data format header
            if i == 0:
                if not line.startswith("+") or len(line.split()) < 2:
                    arg_parser.error("first line of the .keg file must look like '+<level> <column> [<column> ...]'")
                keg = KeggParser(line)
                is_enzyme = getattr(keg, "enzyme_classification", False)
                # The header mirrors the rows built below: first the columns
                # of the data line itself, then the enclosing categories
                # from the deepest level up to level A.
                header_line = list(keg.map_header if keg.kegg_map else keg.data_columns)
                if is_enzyme:
                    header_line.extend(keg.enzyme_header)
                else:
                    header_line.extend(hchar(category) for category in range(keg.data_level-1,-1,-1))
                data = tablib.Dataset(headers=header_line)
                continue
            if line.startswith(("#", "%")):
                continue
            level = hlevel(line)
            #Anything that does not start with a level letter is not a htext line
            if level < 0:
                continue
            content = line[1:].strip()
            #Make sure the line is not empty
            if not content:
                continue
            #Close any previously opened categories as necessary
            while stack.height() and level <= hlevel(stack.top()):
                stack.pop()
            if level != keg.data_level:
                stack.push(line)
                continue
            if len(keg.data_columns) == 1:
                table_line = [content]
            elif keg.kegg_map:
                # A data line without a tab has no second field; its four
                # KO columns are left empty.
                entity, _, classification = content.partition("\t")
                table_line = list(parse_description_line(entity))
                if classification.strip():
                    table_line.extend(parse_kegg_line(classification))
                else:
                    table_line.extend([""] * 4)
            else:
                table_line = content.split("\t")
            for category in stack.content():
                label = category[1:].strip()
                if is_enzyme:
                    table_line.extend(parse_ec_line(label))
                else:
                    table_line.append(label)
            data.append(table_line)

    if data is None:
        arg_parser.error("input file is empty")

    #TODO:handle STDIN as the output file
    payload = getattr(data, args.output_format)
    # Text formats are encoded here so the file can always be opened in
    # binary mode, whatever type the export returns.
    if not isinstance(payload, bytes):
        payload = payload.encode("utf-8")
    with open(args.output_file,"wb") as output:
        output.write(payload)