-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenbankacc2gbk.py
executable file
·114 lines (104 loc) · 4.19 KB
/
genbankacc2gbk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
import os
import re
import sys
import time
from argparse import ArgumentParser
from Bio import Entrez
def parseArgs():
parser = ArgumentParser(description='Given accession(s), retrieve and '
'save a GenBank file.', add_help=False, epilog='If more than one '
'accession is provided, all will be merged into a single output file')
req = parser.add_argument_group('Required')
req.add_argument('acc', metavar='accession', nargs='+',
help='accession number(s) to retrieve')
opt = parser.add_argument_group('Optional')
opt.add_argument('-h', '--help', action='help',
help='show this help message and exit')
opt.add_argument('-l', '--min-length', type=int, metavar='INT',
default=7500, help='minimum record length (base pairs) to save '
'without error warnings [7500]')
opt.add_argument('-o', '--outfile', type=str, metavar='FILE',
default=None, help='output GenBank filename [<acc>.gbk] ')
return parser.parse_args()
def multi_gbk(acc_list, minlen, gbk):
fetched = Entrez.efetch(db='nucleotide', id=acc_list,
rettype='gbwithparts', retmode='text').read().rstrip('\n')
print('\tfound {} GenBank records'.format(
str(len(fetched.split('//\n\n')))))
if len(fetched) < minlen:
sys.exit('ERROR: suspiciously short GenBank record')
contigs = []
for record in fetched.split('\n\n'):
if acc_list[0] not in record:
sys.exit('ERROR: improperly retrieved accession {}'.format(
acc_list[0]))
del acc_list[0]
contig = record.rstrip('\n').split('\n')
i = contig.index('ORIGIN ')
contig.insert(i, 'CONTIG') #enables biopython to SeqIO parse as 'gb'
contigs.extend(contig)
with open(gbk, 'w') as of:
of.write('\n'.join(s for s in contigs))
def get_gbk(accession, minlen, gbk):
fetched = Entrez.efetch(db='nucleotide', id=accession,
rettype='gbwithparts', retmode='text').read()
if accession not in fetched:
sys.exit('ERROR: unable to retrieve accession number {}'.format(
accession))
if 'WGS ' in fetched: #entry is the master record for a whole
# genome shotgun sequencing (WGS) project and lacks sequence data
wgs_acc_list = []
for line in fetched.split('\n'): #handles >1 WGS entry such as NZ_AWGN00000000.1
if line.startswith('WGS '):
if '-' in line: #single range specified such as NZ_MDJV00000000.1
wgs_accs = line.lstrip('WGS ').split('-')
wgs_acc_pref_first = re.sub('\d', '', wgs_accs[0])
wgs_acc_pref_last = re.sub('\d', '', wgs_accs[1])
if wgs_acc_pref_first != wgs_acc_pref_last:
sys.exit('ERROR: unable to parse WGS info')
wgs_acc_suff_first = int(re.sub('\D', '', wgs_accs[0]))
wgs_acc_suff_last = int(re.sub('\D', '', wgs_accs[1]))
acc_suffs = range(wgs_acc_suff_first, wgs_acc_suff_last + 1, 1)
wgs_acc_list.extend( [wgs_acc_pref_first +
str('{num:08d}'.format(num=s)) for s in acc_suffs] )
else: #just one accession specified such as NZ_JPNX00000000.2
wgs_acc_list.append(line.split()[1])
print('\t{} accession is an incomplete chromosome.\n'
'\tRetrieving {} individual contigs...'.format(
accession, str(len(wgs_acc_list))))
time.sleep(3) #be nice to NCBI before doing a second efetch request
multi_gbk(wgs_acc_list, minlen, gbk)
elif len(fetched) < minlen:
sys.exit('{}\nERROR: suspiciously short record for {}'.format(
fetched, accession))
else:
with open(gbk, 'w') as of:
of.write(fetched)
print('\tfound {}'.format(accession))
def main():
opts = parseArgs()
acc = opts.acc
minlen = opts.min_length
Entrez.email = '[email protected]'
# just in case an accession was passed as a comma-separated string, split
if any(',' in a for a in acc):
split_accessions = []
for accessions_string in acc:
for accession in accessions_string.split(','):
split_accessions.append(accession)
acc = split_accessions
# handle outfile naming
if opts.outfile is not None:
out = os.path.realpath(opts.outfile)
elif len(acc) == 1:
out = os.path.join(os.getcwd(), '{}.gbk'.format(acc[0]))
elif len(acc) > 1:
out = os.path.join(os.getcwd(), '{}.gbk'.format(','.join(acc)))
# fetch genbank record(s) and save as single file
if len(acc) == 1:
get_gbk(acc[0], minlen, out)
elif len(acc) > 1:
multi_gbk(acc, minlen, out)
if __name__ == '__main__':
main()