-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGbkParser_fq.py
executable file
·186 lines (171 loc) · 8.68 KB
/
GbkParser_fq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import Bio
from Bio import SeqIO
__author__ = 'mwelland'
__version__ = 1.3
__version_date__ = '11/02/2015'
class GbkParser:
"""
Notes:
Isolated class to deal exclusively with GBK files
Should return dictionary, not write full output
Parses the input file to find all the useful values
This will populate a dictionary to be returned at completion
Dict { full genomic sequence
genename
refseqname
transcripts { transcript { cds_offset
exons { exon_number { genomic_start
genomic_stop
"""
def __init__(self, file_name, padding):
"""
This class is created by instantiating with a file name and a padding value.
These are used to locate the appropriate target file, and to select the
amount of flanking sequence to be appended to exons.
"""
'''
:param file_name: the location/identity of the target input file
:param padding: the required amount of intronic padding
'''
self.genomic = ''
self.padding = padding
self.exons = []
self.cds = []
self.mrna = []
self.fileName = file_name
# Read in the specified input file into a variable
try:
self.transcriptdict = {'transcripts': {},
'input': SeqIO.to_dict(SeqIO.parse(file_name, 'genbank')),
'offset': self.padding,
'filename': file_name}
self.transcriptdict['refseqname'] = self.transcriptdict['input'].keys()[0]
self.is_matt_awesome = True
except IOError as fileNotPresent:
print "The specified file cannot be located: " + fileNotPresent.filename
exit()
@property
def get_version(self):
"""
Quick function to grab version details for final printing
:return:
"""
return 'Version: {0}, Version Date: {1}'.format(str(__version__), __version_date__)
def get_mrna_exons(self):
""" This uses the list of exon start and stop positions to populate
the exon positions in the dictionary"""
for alternative in self.transcriptdict['Alt transcripts']:
alt_dict = {'list_of_exons': [], 'exons': {}}
selected_mrna = self.mrna[alternative-1]
try:
alt_dict['NM_number'] = selected_mrna.qualifiers['transcript_id'][0]
except KeyError:
alt_dict['NM_number'] = self.transcriptdict['genename']
self.transcriptdict['refseqname'] = self.transcriptdict['genename']
self.transcriptdict['genename'] = self.cds[0].qualifiers['gene'][0]
exon = 1
if len(self.exons) == 1:
alt_dict['exons'][exon] = {}
alt_dict['list_of_exons'].append(exon)
start = selected_mrna.location.start
end = selected_mrna.location.end
exon_seq = list(self.genomic[start - self.padding: end + self.padding])
length = len(exon_seq)
minidict = {'genomic_start': start, 'genomic_end': end,
'padded seq': exon_seq, 'length': end - start,
'padded length': length}
alt_dict['exons'][exon] = minidict
else:
subfeatures = selected_mrna._get_sub_features()
for coords in subfeatures:
alt_dict['list_of_exons'].append(exon)
start = coords.location.start
end = coords.location.end
exon_seq = list(self.genomic[start - self.padding: end + self.padding])
length = len(exon_seq)
minidict = {'genomic_start': start, 'genomic_end': end,
'padded seq': exon_seq, 'length': end - start, 'padded length': length}
alt_dict['exons'][exon] = minidict
exon += 1
self.transcriptdict['transcripts'][alternative] = alt_dict
def get_protein(self):
"""
This method takes the CDS tagged block from the GenBank features section and parses the
contents to retrieve the protein sequence. This is added to the appropriate section of
dictionary used to hold all required details of the file.
"""
'''
:param cds: a list containing the cds element(s) of the genbank features
'''
for alternative in self.transcriptdict['Alt transcripts']:
selected_cds = self.cds[alternative-1]
self.transcriptdict['transcripts'][alternative]['protein_length'] = len(selected_cds.qualifiers['translation'][0])*3
self.transcriptdict['transcripts'][alternative]['old_cds_offset'] = selected_cds.location.start
def find_cds_delay(self):
""" Method to find the actual start of the translated sequence
introduced to sort out non-coding exon problems """
for transcript in self.transcriptdict['transcripts']:
offset_total = 0
offset = self.transcriptdict['transcripts'][transcript]['old_cds_offset']
exon_list = self.transcriptdict['transcripts'][transcript]['list_of_exons']
for exon in exon_list:
g_start = self.transcriptdict['transcripts'][transcript]['exons'][exon]['genomic_start']
g_stop = self.transcriptdict['transcripts'][transcript]['exons'][exon]['genomic_end']
if offset > g_stop:
offset_total = offset_total + (g_stop - g_start)
self.transcriptdict['transcripts'][transcript]['exons'][exon]['cds'] = 'before'
elif g_stop > offset >= g_start:
self.transcriptdict['transcripts'][transcript]['cds_offset'] = offset_total + (offset - g_start)
self.transcriptdict['transcripts'][transcript]['exons'][exon]['cds'] = 'after'
elif offset < g_start:
self.transcriptdict['transcripts'][transcript]['exons'][exon]['cds'] = 'after'
def fill_and_find_features(self):
dictionary = self.transcriptdict['input'][self.transcriptdict['refseqname']]
self.genomic = dictionary.seq
features = dictionary.features
for feature in features:
# Multiple exons are expected, not explicitly used
if feature.type == 'exon':
self.exons.append(feature)
""" This section works on the assumption that each exon in the file will use the appropriate gene name
and that the only relevant CDS and mRNA sections will also contain the same accession
"""
try:
self.transcriptdict['genename'] = self.exons[0].qualifiers['gene'][0]
for feature in features:
if feature.type == 'CDS':
if feature.qualifiers['gene'][0] == self.transcriptdict['genename']:
self.cds.append(feature)
elif feature.type == 'mRNA':
if feature.qualifiers['gene'][0] == self.transcriptdict['genename']:
self.mrna.append(feature)
except KeyError:
for feature in features:
if feature.type == 'CDS':
self.cds.append(feature)
elif feature.type == 'mRNA':
self.mrna.append(feature)
note = self.mrna[0].qualifiers['note'][0]
self.transcriptdict['genename'] = note.split('=')[1]
assert len(self.cds) == len(self.mrna), "There are a different number of CDS and mRNA"
return features
def run(self):
"""
This is the main method of the GBK Parser. This method is called after class instantiation
and handles the operation of all the other functions to complete the dictionary which will
hold all of the sequence and exon details of the gene file being parsed
"""
'''
:return transcriptdict: This function fills and returns the dictionary, contents
explained in Class docstring above
'''
print 'BioPython version: ' + str(Bio.__version__)
# initial sequence grabbing and populating dictionaries
self.fill_and_find_features()
self.transcriptdict['full sequence'] = list(self.genomic)
self.transcriptdict['Alt transcripts'] = range(1, len(self.cds)+1)
self.get_mrna_exons()
self.get_protein()
self.find_cds_delay()
del self.transcriptdict['input']
return self.transcriptdict