-
Notifications
You must be signed in to change notification settings - Fork 92
/
find_orf.py
308 lines (260 loc) · 10.5 KB
/
find_orf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#! /usr/bin/env python3
import sys
import re
def vet_nucleotide_sequence(sequence):
"""
Return None if `sequence` is a valid RNA or DNA sequence, else raise exception.
Parameters
----------
sequence : str
A string representing a DNA or RNA sequence (upper or lower-case)
Returns
-------
None
Return nothing (None) if sequence is valid, otherwise raise an
exception.
Examples
--------
>>> vet_nucleotide_sequence('ACGTACGT') == None
True
>>> vet_nucleotide_sequence('not a valid sequence')
Traceback (most recent call last):
...
Exception: Invalid sequence: 'not a valid sequence'
Don't allow mixing of DNA and RNA!
>>> vet_nucleotide_sequence('AUTGC')
Traceback (most recent call last):
...
Exception: Invalid sequence: 'AUTGC'
Don't allow whitespace (or other characters) before, within, or after!
>>> vet_nucleotide_sequence(' ACGT ACGT ')
Traceback (most recent call last):
...
Exception: Invalid sequence: ' ACGT ACGT '
But, an empty string should be deemed valid
>>> vet_nucleotide_sequence('') == None
True
"""
##########################################################################
############################ EDIT CODE BELOW #############################
# `rna_pattern_str` and `dna_pattern_str` need to be regular expressions
# that will match any string of zero or more RNA and DNA bases,
# respectively (and only strings of zero or more RNA and DNA bases).
# Currently, `rna_pattern_str` and `dna_pattern_str` are strings of literal
# characters.
# These are valid regular expressions, but they will only match their
# respective strings exactly.
# Change `rna_pattern_str` and `dna_pattern_str` so that they will match
# any valid RNA and DNA sequence strings, respectively (and only strings of
# RNA and DNA bases).
# Read the docstring above for additional clues.
rna_pattern_str = r'AUCG'
dna_pattern_str = r'ATCG'
##########################################################################
rna_pattern = re.compile(rna_pattern_str)
dna_pattern = re.compile(dna_pattern_str)
if rna_pattern.match(sequence):
return
if dna_pattern.match(sequence):
return
else:
raise Exception("Invalid sequence: {0!r}".format(sequence))
def vet_codon(codon):
"""
Return None if `codon` is a valid RNA codon, else raise an exception.
Parameters
----------
codon : str
A string representing a codon (upper or lower-case)
Returns
-------
None
Return nothing (None) if codon is valid, otherwise raise an
exception.
Examples
--------
Valid codon
>>> vet_codon('AUG') == None
True
lower-case is also vaild
>>> vet_codon('aug') == None
True
DNA is not valid
>>> vet_codon('ATG')
Traceback (most recent call last):
...
Exception: Invalid codon: 'ATG'
A codon must be exactly 3 RNA bases long
>>> vet_codon('AUGG')
Traceback (most recent call last):
...
Exception: Invalid codon: 'AUGG'
"""
##########################################################################
############################ EDIT CODE BELOW #############################
# `codon_pattern_str` needs to be a regular expression that will match any
# codon (but only a string that is one codon).
# Currently, `codon_pattern_str` is only a string of literal characters.
# This is a valid regular expression, but it will only match 'AUG' exactly.
# Change `codon_pattern_str` so that it will match any valid codons, and
# only valid codons.
# Read the docstring above for additional clues.
codon_pattern_str = r'AUG'
##########################################################################
codon_pattern = re.compile(codon_pattern_str)
if codon_pattern.match(codon):
return
else:
raise Exception("Invalid codon: {0!r}".format(codon))
def find_first_orf(sequence,
start_codons = ['AUG'],
stop_codons = ['UAA', 'UAG', 'UGA']):
"""
Return the first open-reading frame in the DNA or RNA `sequence`.
An open-reading frame (ORF) is the part of an RNA sequence that is
translated into a peptide. It must begin with a start codon, followed by
zero or more codons (triplets of nucleotides), and end with a stop codon.
If there are no ORFs in the sequence, an empty string is returned.
Parameters
----------
sequence : str
A string representing a DNA or RNA sequence (upper or lower-case)
start_codons : list of strings
All possible start codons. Each codon must be a string of 3 RNA bases,
upper or lower-case.
stop_codons : list of strings
All possible stop codons. Each codon must be a string of 3 RNA bases,
upper or lower-case.
Returns
-------
str
An uppercase string of the first ORF found in the `sequence` that
starts with any one of the `start_codons` and ends with any one of the
`stop_codons`. If no ORF is found an empty string is returned.
Examples
--------
When the whole RNA sequence is an ORF:
>>> find_first_orf('AUGGUAUAA', ['AUG'], ['UAA'])
'AUGGUAUAA'
When the whole DNA sequence is an ORF:
>>> find_first_orf('ATGGTATAA', ['AUG'], ['UAA'])
'AUGGUAUAA'
When there is no ORF:
>>> find_first_orf('CUGGUAUAA', ['AUG'], ['UAA'])
''
When there is are bases before and after ORF:
>>> find_first_orf('CCAUGGUAUAACC', ['AUG'], ['UAA'])
'AUGGUAUAA'
"""
# Make sure the sequence is valid
vet_nucleotide_sequence(sequence)
# Make sure the codons are valid
for codon in start_codons:
vet_codon(codon)
for codon in stop_codons:
vet_codon(codon)
# Get copies of everything in uppercase
seq = sequence.upper()
starts = [c.upper() for c in start_codons]
stops = [c.upper() for c in stop_codons]
# Make sure seq is RNA
seq = seq.replace('T', 'U')
##########################################################################
############################ EDIT CODE BELOW #############################
# `orf_pattern_str` needs to be a regular expression that will match an
# open reading frame within a string of RNA bases. At this point we know
# the string only contains uppercase A, C, G, and U.
# I recommend starting by hardcoding the standard start and stop codons
# (the ones listed as defaults for this function) into the regular
# expression. After you get that working, then try generalizing it to work
# for any start/stop codons.
# Currently, `orf_pattern_str` is only a string of literal characters. This
# is a valid regular expression, but it will only match 'AUGGUAUAA'
# exactly. Change `orf_pattern_str` so that it will match any open reading
# frame.
# Read the docstring above for additional clues.
orf_pattern_str = r'AUGGUAUAA'
##########################################################################
# Create the regular expression object
orf_pattern = re.compile(orf_pattern_str)
# Search the sequence
match_object = orf_pattern.search(seq)
if match_object:
return match_object.group()
return ''
def parse_sequence_from_path(path):
# Try to open the path to read from it, and handle exceptions if they
# arrise
try:
file_stream = open(path, 'r')
except FileNotFoundError as e:
sys.stderr.write("Sorry, couldn't find path {}".format(path))
raise e
except IsADirectoryError as e:
sys.stderr.write("Sorry, path {} appears to be a directory".format(
path))
raise e
except:
sys.stderr.write("Sorry, something went wrong when trying to open {}".format(
path))
raise
# If we've reached here, the file is open and ready to read
sequence = ''
# A for loop to visit each line in the file
for line in file_stream:
# Strip whitespace from the line and concatenate it to the end of the
# sequence
sequence += line.strip()
return sequence
def main():
import argparse
# Create a command-line parser object
parser = argparse.ArgumentParser()
default_start_codons = ['AUG']
default_stop_codons = ['UAA', 'UAG', 'UGA']
# Tell the parser what command-line arguments this script can receive
parser.add_argument('sequence',
metavar = 'SEQUENCE',
type = str,
help = ('The sequence to search for an open-reading frame. '
'If the path flag (\'-p\'/\'--path\') is specified, '
'then this should be a path to a file containing the '
'sequence to be searched.'))
parser.add_argument('-p', '--path',
action = 'store_true',
help = ('The sequence argument should be treated as a path to a '
'containing the sequence to be searched.'))
parser.add_argument('-s', '--start-codon',
type = str,
action = 'append', # append each argument to a list
default = None,
help = ('A start codon. This option can be used multiple times '
'if there are multiple start codons. '
'Default: {0}.'.format(" ".join(default_start_codons))))
parser.add_argument('-x', '--stop-codon',
type = str,
action = 'append', # append each argument to a list
default = None,
help = ('A stop codon. This option can be used multiple times '
'if there are multiple stop codons. '
'Default: {0}.'.format(" ".join(default_stop_codons))))
# Parse the command-line arguments into a 'dict'-like container
args = parser.parse_args()
# Check to see if the path option was set to True by the caller. If so, parse
# the sequence from the path
if args.path:
sequence = parse_sequence_from_path(args.sequence)
else:
sequence = args.sequence
# Check to see if start/stop codons were provided by the caller. If not,
# use the defaults.
if not args.start_codon:
args.start_codon = default_start_codons
if not args.stop_codon:
args.stop_codon = default_stop_codons
orf = find_first_orf(sequence = sequence,
start_codons = args.start_codon,
stop_codons = args.stop_codon)
sys.stdout.write('{}\n'.format(orf))
if __name__ == '__main__':
main()