forked from bio-ontology-research-group/deepgoplus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpfam.py
executable file
·54 lines (48 loc) · 1.56 KB
/
pfam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
import click as ck
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from subprocess import Popen, PIPE
import time
from utils import Ontology
from aminoacids import to_onehot
import io
import gzip
# NOTE(review): not referenced anywhere in the visible script; presumably a
# maximum sequence length shared with sibling scripts -- confirm before removing.
MAXLEN = 2000
def _parse_pfam_seed(lines):
    """Group the lines of a Pfam Stockholm alignment file into families.

    Args:
        lines: Iterable of text lines (for example an open file object).

    Returns:
        A list of ``(pfam_id, ipro_id, aligns)`` tuples in file order.
        ``aligns`` is a list of ``[name, aligned_sequence]`` pairs and
        ``ipro_id`` is ``''`` when the family has no InterPro reference.
    """
    entries = []
    pfam_id = ''
    ipro_id = ''
    aligns = []
    for raw_line in lines:
        line = raw_line.strip()
        # Blank lines and the '//' record terminator carry no data.
        if not line or line == '//':
            continue
        # Tokenise on whitespace instead of slicing at fixed columns, so the
        # parser does not depend on how many spaces pad the '#=GF XX' tag.
        fields = line.split()
        if line.startswith('#'):
            if fields[:2] == ['#=GF', 'AC'] and len(fields) > 2:
                # A new accession starts a new family: store the previous one.
                if pfam_id != '':
                    entries.append((pfam_id, ipro_id, aligns))
                pfam_id = fields[2]
                # Reset per-family state so a family without an InterPro
                # line does not inherit the previous family's identifier.
                ipro_id = ''
                aligns = []
            elif (fields[:3] == ['#=GF', 'DR', 'INTERPRO;']
                    and len(fields) > 3):
                # 'IPR000001;' -> 'IPR000001'
                ipro_id = fields[3].rstrip(';')
            continue
        if len(fields) == 2:
            aligns.append(fields)
        else:
            # Report lines that are not a '<name> <sequence>' pair.
            print(fields)
    # The file ends without another accession line, so the last family has
    # to be stored explicitly.
    if pfam_id != '':
        entries.append((pfam_id, ipro_id, aligns))
    return entries


@ck.command()
@ck.option('--pfam-file', '-pf', default='data/Pfam-A.seed', help='Pfam')
def main(pfam_file):
    """Convert a Pfam seed alignment file into a FASTA file.

    Reads ``pfam_file`` (decoded as latin-1), collects every family's
    accession, InterPro identifier and aligned sequences, and writes
    ``data/pfam.fa`` with one record per aligned sequence. Each header has
    the form ``>{pfam_id}; {ipro_id}; {sequence_name}`` and the sequence is
    written with the ``.`` gap characters removed.

    Args:
        pfam_file: Path of the Pfam alignment file to read.
    """
    # Parse the whole input first so that a missing or unreadable input file
    # fails before the output file is created or truncated.
    with open(pfam_file, 'rt', encoding='latin-1') as f:
        entries = _parse_pfam_seed(f)

    with open('data/pfam.fa', 'w') as w:
        for pfam_id, ipro_id, aligns in entries:
            for name, sequence in aligns:
                w.write(f'>{pfam_id}; {ipro_id}; {name}\n')
                w.write(sequence.replace('.', '') + '\n')
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    main()