Skip to content

Commit 2b83719

Browse files
committed
parses annotations from GenBank files
1 parent ea67ddd commit 2b83719

File tree

1 file changed

+63
-0
lines changed

1 file changed

+63
-0
lines changed

Diff for: extract.nfo.from.GBK.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/usr/bin/env python
2+
3+
4+
# Example: <script>.py -i input.gbk -s VFDB_setA_pro -f CDS -q inference -r product
5+
from argparse import ArgumentParser
6+
from Bio import SeqIO
7+
8+
def parseArgs(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse falls back to sys.argv[1:], so existing
            callers that pass nothing behave exactly as before.

    Returns:
        The argparse namespace with the attributes infile, query_search,
        query_feature, outfile, query_qualifier and report_qualifier.
    """
    # add_help=False so that -h/--help can be registered manually below and
    # listed inside the 'Optional' group instead of argparse's default group.
    parser = ArgumentParser(
        description='Parses a GenBank file for data '
        'given a search term and specified fields to query. An additional '
        'parsed field is also reported in tab-delimited format.',
        add_help=False)

    req = parser.add_argument_group('Required')
    req.add_argument('-i', '--infile', required=True, metavar='FILE',
        help='input GenBank file')
    req.add_argument('-s', '--query-search', required=True, metavar='STR',
        help='search term to look for within the query feature '
        'and query qualifier')

    opt = parser.add_argument_group('Optional')
    opt.add_argument('-f', '--query-feature', default='CDS', metavar='STR',
        help='genbank feature type to search in, e.g., CDS, gene, rRNA, '
        'source, tRNA, misc_feature')
    opt.add_argument('-h', '--help', action='help',
        help='show this help message and exit')
    opt.add_argument('-o', '--outfile', default='parsed.tab', metavar='FILE',
        help='output tab-delimited file containing <locus_tag>\\t'
        '<query-qualifier>\\t<report-qualifier> [./parsed.tab]')
    opt.add_argument('-q', '--query-qualifier', default='inference',
        metavar='STR', help='qualifier term within each genbank feature to '
        'search in, e.g., locus_tag, inference, codon_start, product, '
        'transl_table, translation')
    opt.add_argument('-r', '--report-qualifier', default='product',
        metavar='STR', help='additional qualifier term to parse data from '
        'and report when queries are found')
    return parser.parse_args(argv)
35+
36+
def _format_hits(records, query_feat, query_qualif, query_str, rep_qualif):
    """Yield one tab-delimited line per feature that matches the query.

    A feature matches when its type equals query_feat and at least one value
    of its query_qualif qualifier contains query_str as a substring. Each
    line is '<locus_tag>\\t<matching values joined by a space>\\t<first value
    of rep_qualif>'.

    A matching feature without a locus_tag or rep_qualif qualifier gets an
    empty field for it instead of raising KeyError, so the output always has
    three columns.
    """
    for rec in records:
        for feature in rec.features:
            if feature.type != query_feat:
                continue
            # Qualifier values are iterated and indexed as a list of strings
            # here -- presumably what the GenBank parser produces; verify if
            # records come from another source.
            found = [s for s in feature.qualifiers.get(query_qualif, [])
                     if query_str in s]
            if not found:
                continue
            locus_tag = (feature.qualifiers.get('locus_tag') or [''])[0]
            report = (feature.qualifiers.get(rep_qualif) or [''])[0]
            yield '{}\t{}\t{}'.format(locus_tag, ' '.join(found), report)


def main():
    """Parse the input GenBank file and write matching features to a file.

    Reads the command-line options via parseArgs(), scans every record in
    the input file for features matching the query, and writes one
    tab-delimited line per match to the output file.
    """
    opts = parseArgs()

    # Collect every hit before opening the output file, so a failure while
    # parsing does not create or truncate the output. The context manager
    # closes the input handle, which was previously leaked.
    with open(opts.infile, 'r') as ifh:
        lines = list(_format_hits(
            SeqIO.parse(ifh, 'genbank'), opts.query_feature,
            opts.query_qualifier, opts.query_search, opts.report_qualifier))

    with open(opts.outfile, 'w') as ofh:
        for line in lines:
            ofh.write('{}\n'.format(line))
61+
62+
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)