forked from reynoldsk/pySCA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate_MSA.py
executable file
·120 lines (98 loc) · 5.95 KB
/
annotate_MSA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
"""
The annotate_MSA script provides utilities to automatically annotate sequence headers
(for a fasta file) with taxonomic information. Currently this can be done in one of two ways:
1) For PFAM alignments, annotations can be extracted from the file pfamseq.txt (please download from: ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/database_files)
2) For Blast alignments, annotations can be added using the NCBI Entrez utilities provided by BioPython. In this case, an additional command line argument (--giList, see below) should specify a list of gi numbers. These numbers are then used to query NCBI for taxonomy information (note that this approach requires a network connection).
To quickly extract gi numbers from a list of headers (variable name
'hd') with typical Blast alignment formatting, the following line of python code is useful:
>>> gis = [h.split('_')[1] for h in hd]
Alternatively, the script alnParseGI.py will accomplish this. For both the PFAM and NCBI utilities, the process of sequence annotation *can be slow* (on the order of hours, particularly for NCBI entrez with larger alignments). However, the annotation process only needs to be run once per alignment.
:Arguments:
Input_MSA.fasta (an input sequence alignment)
:Keyword Arguments:
-o, --output Specify an output file, Output_MSA.an
-a, --annot Annotation method. Options are 'pfam' or 'ncbi'. Default: 'pfam'
-g, --giList This argument is necessary for the 'ncbi' method. Specifies a file containing a list of gi numbers corresponding to the sequence order in the alignment; a gi number of "0" indicates that a gi number wasn't assigned for a particular sequence.
-p, --pfam_seq Location of the pfamseq.txt file. Defaults to path2pfamseq (specified at the top of scaTools.py)
:Examples:
>>> ./annotate_MSA.py Inputs/PF00186_full.txt -o Outputs/PF00186_full.an -a 'pfam'
>>> ./annotate_MSA.py Inputs/DHFR_PEPM3.fasta -o Outputs/DHFR_PEPM3.an -a 'ncbi' -g Inputs/DHFR_PEPM3.gis
:By: Rama Ranganathan, Kim Reynolds
:On: 9.22.2014
Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
This program is free software distributed under the BSD 3-clause
license, please see the file LICENSE for details.
"""
import time, sys
import argparse
import scaTools as sca
from Bio import SeqIO
from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
Entrez.email = "[email protected]" # PLEASE change to your own email !!!
if __name__ == '__main__':
#parse inputs
parser = argparse.ArgumentParser()
parser.add_argument("Input_MSA", help='input sequence alignment')
parser.add_argument("-o", "--output", dest = "output", default='sys.stdout', help="Outputfile name. Default: sys.stdout")
parser.add_argument("-a", "--annot", dest ="annot", default='pfam', help="Annotation method. Options are 'pfam' or 'ncbi'. Default: 'pfam'")
parser.add_argument("-g", "--giList", dest = "giList", default=None, help="This argument is necessary for the 'ncbi' method. Specifies a file containing a list of gi numbers corresponding to the sequence order in the alignment; a gi number of 0 indicates that a gi number wasn't assigned for a particular sequence.")
parser.add_argument("-p","--pfam_seq", dest = "pfamseq", default=None, help="Location of the pfamseq.txt file. Defaults to path2pfamseq (specified at the top of scaTools.py)")
options = parser.parse_args()
if (options.annot != 'pfam') & (options.annot != 'ncbi'):
sys.exit("The option -a must be set to 'pfam' or 'ncbi' - other keywords are not allowed.")
if (options.annot == 'ncbi') & (options.giList == None):
sys.exit("To use NCBI entrez annotation, you must specify a file containing a list of gi numbers (see the --giList argument)")
if (options.annot == 'pfam'):
#Annotate a PFAM alignment
if (options.pfamseq == None):
sca.AnnotPfam(options.Input_MSA, options.output)
else:
sca.AnnotPfam(options.Input_MSA, options.output, options.pfamseq)
else:
#Annotate using GI numbers/NCBI entrez
gi_lines = open(options.giList, 'r').readlines()
gi = [int(k) for k in gi_lines]
gi_blocks=[gi[x:x+200] for x in xrange(0, len(gi), 200)]
taxID = list()
start = time.clock()
for i,k in enumerate(gi_blocks):
taxonList = Entrez.read(Entrez.elink(dbfrom="protein",db="taxonomy",id=k))
for x in range(len(taxonList)):
try:
taxID.append(taxonList[x]["LinkSetDb"][0]["Link"][0]["Id"])
except:
taxID.append('')
end = time.clock()
print "Look up for Tax IDs complete. Time: %f" % (end-start)
# Collect records with lineage information
print "Collecting taxonomy information..."
start = time.clock()
records = list()
for i,k in enumerate(taxID):
try:
handle = Entrez.efetch(db="taxonomy", id=k, retmode="xml")
temp_rec = Entrez.read(handle)
handle.close()
records.append(temp_rec[0])
print "%s" % (temp_rec[0]['Lineage'])
print "%s" % (temp_rec[0]['ScientificName'])
except:
records.append('')
end = time.clock()
print "Look up for taxonomy information complete. Time: %f" % (end-start)
# Write to the output fasta file.
s_records = list()
[hd,seqs] = sca.readAlg(options.Input_MSA)
f = open(options.output, 'w')
for i,k in enumerate(seqs):
try:
hdnew = hd[i]+'|'+records[i]['ScientificName']+'|'+','.join(records[i]['Lineage'].split(';'))
except:
hdnew = hd[i]+'| unknown '
print "Unable to add taxonomy information for seq: %s" % hd[i]
f.write('>%s\n' % hdnew)
f.write('%s\n' % k)
f.close()