|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | + |
| 4 | +# Example: <script>.py -i input.gbk -s VFDB_setA_pro -f CDS -q inference -r product |
| 5 | +from argparse import ArgumentParser |
| 6 | +from Bio import SeqIO |
| 7 | + |
def parseArgs():
    """Build the command-line interface and parse ``sys.argv``.

    Required options are the input GenBank file (``-i``) and the search
    term (``-s``).  Optional settings choose the feature type to inspect
    (``-f``, default ``CDS``), the qualifier to search in (``-q``, default
    ``inference``), the extra qualifier to report (``-r``, default
    ``product``) and the output path (``-o``, default ``parsed.tab``).

    Returns:
        argparse.Namespace with attributes ``infile``, ``query_search``,
        ``query_feature``, ``outfile``, ``query_qualifier`` and
        ``report_qualifier``.
    """
    summary = (
        'Parses a GenBank file for data given a search term and '
        'specified fields to query. An additional parsed field is also '
        'reported in tab-delimited format.'
    )
    # The automatic -h/--help is disabled so it can be re-added below and
    # listed inside the custom "Optional" group.
    cli = ArgumentParser(description=summary, add_help=False)

    required = cli.add_argument_group('Required')
    required.add_argument(
        '-i', '--infile',
        required=True,
        metavar='FILE',
        help='input GenBank file',
    )
    required.add_argument(
        '-s', '--query-search',
        required=True,
        metavar='STR',
        type=str,
        help=('search term to look for within the query feature and '
              'query qualifier'),
    )

    optional = cli.add_argument_group('Optional')
    optional.add_argument(
        '-f', '--query-feature',
        default='CDS',
        metavar='STR',
        help=('genbank feature type to search in, e.g., CDS, gene, rRNA, '
              'source, tRNA, misc_feature'),
    )
    optional.add_argument(
        '-h', '--help',
        action='help',
        help='show this help message and exit',
    )
    optional.add_argument(
        '-o', '--outfile',
        default='parsed.tab',
        metavar='FILE',
        help=('output tab-delimited file containing <locus_tag>\\t'
              '<query-qualifier>\\t<report-qualifier> [./parsed.tab]'),
    )
    optional.add_argument(
        '-q', '--query-qualifier',
        default='inference',
        metavar='STR',
        help=('qualifier term within each genbank feature to search in, '
              'e.g., locus_tag, inference, codon_start, product, '
              'transl_table, translation'),
    )
    optional.add_argument(
        '-r', '--report-qualifier',
        default='product',
        metavar='STR',
        help=('additional qualifier term to parse data from and report '
              'when queries are found'),
    )

    namespace = cli.parse_args()
    return namespace
| 35 | + |
def _first_qualifier(feature, key, default=''):
    """Return the first value stored under ``key`` in ``feature.qualifiers``.

    Falls back to ``default`` when the qualifier is absent or its value
    list is empty, so a feature lacking the qualifier does not abort the
    whole run with a KeyError/IndexError.
    """
    values = feature.qualifiers.get(key)
    return values[0] if values else default


def main():
    """Search a GenBank file and write matching features as tab-delimited rows.

    For every feature whose type equals the requested query feature and
    which carries the query qualifier, each value of that qualifier is
    tested for the search term as a substring.  When at least one value
    matches, one row is recorded:

        <locus_tag>\\t<matching values joined by a space>\\t<report qualifier>

    A matching feature that lacks ``locus_tag`` or the report qualifier
    gets an empty string in that column.  All rows are collected first and
    written to the output file only after the input has been fully parsed,
    so a parse failure does not create or overwrite the output file.
    """
    opts = parseArgs()
    query_feat = opts.query_feature
    query_qualif = opts.query_qualifier
    query_str = opts.query_search
    rep_qualif = opts.report_qualifier

    rows = []
    # Context manager guarantees the input handle is closed even if the
    # parser raises part-way through the file.
    with open(opts.infile, 'r') as ifh:
        for rec in SeqIO.parse(ifh, 'genbank'):
            for feature in rec.features:
                if feature.type != query_feat:
                    continue
                # Qualifier values are iterated as a list of strings; an
                # absent qualifier simply yields no hits.
                found = [s for s in feature.qualifiers.get(query_qualif, [])
                         if query_str in s]
                if not found:
                    continue
                rows.append('{}\t{}\t{}'.format(
                    _first_qualifier(feature, 'locus_tag'),
                    ' '.join(found),
                    _first_qualifier(feature, rep_qualif)))

    with open(opts.outfile, 'w') as ofh:
        for row in rows:
            ofh.write('{}\n'.format(row))
| 61 | + |
# Run the command-line workflow only when this file is executed as a script,
# so importing the module does not trigger argument parsing or file I/O.
if __name__ == '__main__':
    main()
0 commit comments