-
Notifications
You must be signed in to change notification settings - Fork 2
/
construct_word2vec_strings.py
97 lines (86 loc) · 3.75 KB
/
construct_word2vec_strings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from itertools import takewhile
def return_gap_name(start, stop):
    """Return the size-bin label for the inclusive gap ``start``..``stop``.

    The gap length is ``stop - start + 1``.  Lengths up to 100 map to
    ``"G100"``, up to 200 to ``"G200"``, up to 300 to ``"G300"``, up to 400
    to ``"G400"``; anything longer is labelled ``"G500"``.
    """
    gap_length = stop - start + 1
    bins = ((100, "G100"), (200, "G200"), (300, "G300"), (400, "G400"))
    for upper_bound, label in bins:
        if gap_length <= upper_bound:
            return label
    # Everything above the last threshold shares one open-ended bin.
    return "G500"
def handle_cache(data):
    """Print one line describing a protein as a sequence of domains and gaps.

    ``data`` is a list of comma-separated record strings that all belong to
    one protein.  The fields this function reads are:

    * ``[0]`` protein identifier (used as the prefix of the printed line)
    * ``[5]`` feature name; names starting with ``"PF"`` are treated as
      Pfam domains, everything else as an annotation feature
    * ``[6]`` / ``[7]`` feature start / end coordinates
    * ``[8]`` protein length (read from the first row after sorting;
      presumably identical on every row of a protein -- TODO confirm)

    The printed line has the form ``"<protein>: <token> <token> ..."`` where
    each token is either a Pfam domain name or a gap label:

    * Gaps:                 G100 G200 G300 G400 G500
    * Disorder gaps:        DG100 DG200 DG300 DG400 DG500
    * Low-complexity gaps:  LG100 LG200 LG300 LG400 LG500
    * Coiled-coil gaps:     CG100 CG200 CG300 CG400 CG500

    Nothing is printed (and ``None`` is returned) when ``data`` is empty or
    contains no Pfam domain.
    """
    if not data:
        return

    rows = [line.split(",") for line in data]
    # Sort numerically by start coordinate.  Sorting the raw strings would
    # order "100" before "20" and scramble the gap computation below.
    rows.sort(key=lambda row: int(row[6]))
    protein_length = int(rows[0][8])

    # Select every Pfam row.  (A takewhile() over the start-sorted rows would
    # stop at the first non-Pfam feature and drop all later domains.)
    pf_domains = [row for row in rows if row[5].startswith("PF")]
    features = [row for row in rows if not row[5].startswith("PF")]

    # Walk the Pfam domains in coordinate order, emitting a gap entry before
    # a domain whenever enough uncovered sequence precedes it.
    # NOTE(review): overlapping/nested domains are not merged; the gap is
    # always measured from the end of the immediately preceding domain.
    domains_and_gaps = []
    previous_end = 0
    for domain in pf_domains:
        gap_start = previous_end + 1
        gap_end = int(domain[6]) - 1
        if gap_end - gap_start > 20:
            domains_and_gaps.append(
                [return_gap_name(gap_start, gap_end), domain[0],
                 gap_start, gap_end])
        domains_and_gaps.append([domain[5], domain[0], domain[6], domain[7]])
        previous_end = int(domain[7])

    # Trailing gap between the last domain and the end of the protein.
    if pf_domains:
        last_domain = pf_domains[-1]
        if protein_length - previous_end > 20:
            domains_and_gaps.append(
                [return_gap_name(previous_end, protein_length),
                 last_domain[0], previous_end, protein_length])

    # Re-annotate gaps: the first non-Pfam feature whose start lies inside a
    # gap (start <= pos < end) and whose name is recognised prefixes the gap
    # label.  Only one prefix is ever applied per gap.
    feature_prefixes = (("mobidb-lite", "D"), ("Low", "L"), ("Coiled", "C"))
    for region in domains_and_gaps:
        if region[0].startswith("PF"):
            continue
        for feature in features:
            if not int(region[2]) <= int(feature[6]) < int(region[3]):
                continue
            prefix = next(
                (tag for name_start, tag in feature_prefixes
                 if feature[5].startswith(name_start)),
                None)
            if prefix is not None:
                region[0] = prefix + region[0]
                break

    if not domains_and_gaps:
        return
    tokens = " ".join(region[0] for region in domains_and_gaps)
    print((domains_and_gaps[0][1] + ": " + tokens).rstrip())
# Stream the domain table and hand each protein's consecutive rows to
# handle_cache().  Rows for one protein are expected to be contiguous in the
# file; the first comma-separated field is the protein identifier.
previous_uniprot = "XXX"
line_cache = []
with open("/scratch1/NOT_BACKED_UP/dbuchan/interpro/derived/"
          "final_domains_E.dat") as pfam:
    # (swap the file name for "test.dat" when running a small local test)
    for line in pfam:
        line = line.rstrip()
        if not line:
            # Blank lines carry no record and would crash handle_cache().
            continue
        entries = line.split(",")
        # Exact comparison: a substring test would merge an accession such as
        # "A0A024" into a preceding "A0A024R161".
        if line_cache and entries[0] != previous_uniprot:
            handle_cache(line_cache)
            line_cache = []
        line_cache.append(line)
        previous_uniprot = entries[0]
# Flush the final protein (nothing to do for an empty file).
if line_cache:
    handle_cache(line_cache)