-
Notifications
You must be signed in to change notification settings - Fork 2
/
map_taxonomy.py
54 lines (48 loc) · 2.19 KB
/
map_taxonomy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import csv
import sys
# Lift the csv module's per-field size cap to the largest value Python can
# express so over-long fields in the input files are not rejected.
csv.field_size_limit(sys.maxsize)

# Index the taxonomy names dump.
# name_lookup maps the first column of names.dmp (presumably the NCBI taxon
# id -- confirm against the dump) to a dict of the form
#     {'name': <scientific name with commas removed>, 'kingdom': 'unknown'}
# Only rows whose fourth column contains 'scientific name' are kept.
name_lookup = {}
with open('/scratch1/NOT_BACKED_UP/dbuchan/ncbi_taxonomy/names.dmp') as namesfile:
    for raw_fields in csv.reader(namesfile, delimiter='|', quotechar='\''):
        # Splitting on '|' leaves padding whitespace around each value.
        fields = [value.strip() for value in raw_fields]
        if 'scientific name' not in fields[3]:
            continue
        # Commas are dropped from the name because the final output of this
        # script is joined with ',' and written without any quoting.
        name_lookup[fields[0]] = {
            'name': fields[1].replace(',', ''),
            'kingdom': 'unknown',
        }
# Attach a top-level category code to every taxon already in name_lookup.
# Each categories.dmp row is tab separated; column 0 is stored as the
# 'kingdom' value and columns 1 and 2 are both treated as taxon ids.
# NOTE(review): per the NCBI taxcat readme these are (category code,
# species-level taxid, taxid) -- confirm against the local dump.
with open('/scratch1/NOT_BACKED_UP/dbuchan/ncbi_taxonomy/categories.dmp') as catfile:
    catreader = csv.reader(catfile, delimiter='\t', quotechar='\'')
    for row in catreader:
        category = row[0]
        # Update each id under its OWN key.  The previous code tested
        # row[2] but then wrote to name_lookup[row[1]], so the taxon in
        # column 2 never received its category, and a KeyError was raised
        # whenever column 2 was known but column 1 was not.
        for taxon_id in (row[1], row[2]):
            if taxon_id in name_lookup:
                name_lookup[taxon_id]['kingdom'] = category
# Join the UniProt id-mapping table against the taxonomy index.
# uniprot_lookup maps the first column of each mapping row (presumably the
# UniProt accession -- confirm) to the name_lookup entry selected by the
# row's 13th column (index 12, presumably the NCBI taxon id -- confirm).
# The stored value is the SAME dict object held in name_lookup, not a copy;
# it gains a 'taxaid' key holding the id it was found under.
uniprot_lookup = {}
with open('/scratch1/NOT_BACKED_UP/dbuchan/uniprot/idmapping_selected.tab') as mapping:
    for fields in csv.reader(mapping, delimiter='\t', quotechar='\''):
        taxon_id = fields[12]
        taxon_entry = name_lookup.get(taxon_id)
        if taxon_entry is None:
            # Taxon not present in the names index: skip this mapping row.
            continue
        taxon_entry['taxaid'] = taxon_id
        uniprot_lookup[fields[0]] = taxon_entry
# Stream the Pfam table and emit one annotated line per recognised row.
# A row is recognised when its first column is a key of uniprot_lookup.
# Output columns: first input column, taxaid, kingdom, name, then the
# remaining input columns, joined with ',' (no quoting is applied).
with open('/scratch1/NOT_BACKED_UP/dbuchan/interpro/derived/protein2ipr_pfam.dat') as pfam:
    # Alternative inputs left in the original script as commented-out paths:
    #   /scratch1/NOT_BACKED_UP/dbuchan/interpro/masked_regions.dat
    #   /scratch1/NOT_BACKED_UP/dbuchan/interpro/disorder_regions.dat
    for record in csv.reader(pfam, delimiter=',', quotechar='\''):
        protein_id = record[0]
        taxon_entry = uniprot_lookup.get(protein_id)
        if taxon_entry is None:
            continue
        annotated = [
            protein_id,
            taxon_entry['taxaid'],
            taxon_entry['kingdom'],
            taxon_entry['name'],
            *record[1:],
        ]
        # Flush after every emitted line so downstream consumers see
        # output as it is produced.
        print(",".join(annotated), flush=True)