-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathinput_parser.py
More file actions
83 lines (61 loc) · 2.46 KB
/
input_parser.py
File metadata and controls
83 lines (61 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from tree import Node, Tree
def main(fpath):
    """Read a tab-separated taxonomy table and build one Tree per sample.

    The first line of the file is the header (see ``parse_header``).  The
    second line is expected to describe the root node; its per-sample counts
    seed the root of every sample's tree.  Each remaining non-blank line
    yields one Node per sample, which is handed to ``Tree.add_node``
    together with the rank ID split on '.'.

    Parameters:
        fpath: path of the input file.

    Returns:
        A tuple ``(trees, sample_names)`` where ``trees`` maps each sample
        name to its Tree and ``sample_names`` is a list of those names.

    Raises:
        Whatever ``open``, ``parse_header`` or ``parse_line`` raise for a
        missing file or a malformed header/line.
    """
    trees = {}
    # The context manager closes the file even when parsing fails part-way.
    with open(fpath, 'r') as fhand:
        rankInd, nameInd, totalInd, samples = parse_header(fhand.readline())

        ## Create a dictionary which has every sample as a key. The value is
        ## a tree whose root carries the counts found on the second line.
        root_info = fhand.readline()
        root_counts = parse_line(root_info, rankInd, nameInd, totalInd,
                                 samples)[-1]
        for sample_name in samples:
            trees[sample_name] = Tree(name=sample_name)
            root_node = Node(name='{}:root'.format(sample_name),
                             count=root_counts[sample_name], level=[0])
            trees[sample_name].root = root_node

        ## Add rest of the information
        for dataline in fhand:
            # A blank (e.g. trailing) line carries no data; skip it rather
            # than fail while parsing its missing columns.
            if not dataline.strip():
                continue
            rankID, name, _total, _taxlvl, sample_values = parse_line(
                dataline, rankInd, nameInd, totalInd, samples)
            level = rankID.split('.')
            for samp in samples:
                samp_node = Node(name=name, level=level,
                                 count=sample_values[samp])
                trees[samp].add_node(samp_node, level)
    return trees, list(samples)
def hierarchy(rank):
    """Yield each dotted prefix of ``rank``, shortest first.

    For ``'0.1.2'`` this yields ``'0'``, ``'0.1'`` and finally ``'0.1.2'``.
    A rank without any '.' (including the empty string) yields itself once.

    Parameters:
        rank: dot-separated rank ID string.
    """
    parts = rank.split('.')
    for depth in range(1, len(parts) + 1):
        yield '.'.join(parts[:depth])
def parse_line(line,rankInd,nameInd,totalInd,sampleInds):
    """Split one tab-separated data line into its typed fields.

    Parameters:
        line: raw text of the line; a trailing newline (LF or CRLF) is
            ignored.
        rankInd: column index of the rank ID.
        nameInd: column index of the taxon name.
        totalInd: column index of the total count.
        sampleInds: mapping of sample name -> column index.

    Returns:
        ``(rankID, name, total, taxlvl, sample_values)`` where ``total`` is
        an int, ``taxlvl`` is the number of '.' in ``rankID`` and
        ``sample_values`` maps each sample name to its int count.

    Raises:
        IndexError: if the line has fewer columns than an index requires.
        ValueError: if a total or sample column is not an integer.
    """
    # Strip '\r' as well as '\n' so a CRLF line ending does not stick to
    # the last column when that column is text (name or rank ID).
    fields = line.rstrip('\r\n').split('\t')
    rankID = fields[rankInd]
    name = fields[nameInd]
    total = int(fields[totalInd])
    taxlvl = rankID.count('.') ## This is the depth of the node
    sample_values = {sample: int(fields[column])
                     for sample, column in sampleInds.items()}
    return rankID,name,total,taxlvl,sample_values
def parse_header(line):
    """Read the header line and record the position of each column of interest.

    The header is tab-separated and must contain the columns ``rankID``,
    ``taxon`` and ``total``; every non-empty column after ``total`` is
    treated as a sample, e.g.::

        rankID <tab> taxon <tab> total <tab> sample1 <tab> sample2 ...

    Spaces are removed from the header before it is split, so they never
    appear in the returned sample names.

    Parameters:
        line: raw text of the header line; a trailing newline (LF or CRLF)
            is ignored.

    Returns:
        ``(rankInd, nameInd, totalInd, samples)`` - the three column
        indices and a dict mapping each sample name to its column index.

    Raises:
        ValueError: if ``rankID``, ``taxon`` or ``total`` is missing.
    """
    columns = line.replace(' ', '').rstrip('\r\n').split('\t')
    rankInd = columns.index('rankID')
    nameInd = columns.index('taxon')
    totalInd = columns.index('total')
    samples = {}
    first_sample = totalInd + 1
    # Take the index from the column's own position instead of searching
    # the whole header: a sample named like an earlier column would
    # otherwise be mapped to that earlier column.
    for position, column in enumerate(columns[first_sample:], first_sample):
        if column:
            # setdefault keeps the first position when a sample name repeats.
            samples.setdefault(column, position)
    return rankInd, nameInd, totalInd, samples