-
Notifications
You must be signed in to change notification settings - Fork 0
/
Biology Meets Programming - Updated Greedy Motif.py
339 lines (273 loc) · 12 KB
/
Biology Meets Programming - Updated Greedy Motif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# coding: utf-8
# In[1]:
##Yousuf's Method. The input is meant to be a list of equal-sized k-mers and the goal is figure out the total of number of
## nucleotide counts at each posiiton and return that as a dictionary of lists (each list is a different nucleotide)
def Count(Motifs):
#I am creating a dictionary of nucleotides which will contain a list of nt frequency at each position
count = {"A": [], "C": [], "G": [], "T": []}
#d represents the number of different sequences I want to find the motif in
d = len(Motifs)
#k represents the length of the kmer I'm looking at... since all of them are the same length I'm just looking at
#the first one
k = len(Motifs[0])
#I am first iterating through the positions of each motif, starting at position zero and ending with the final position k
for i in range(k):
A = 0
T = 0
C = 0
G = 0
#I am iterating through every string at position i and will add up how many nucleotides go to what then append the
#number to the list
for kmer_number,kmer in enumerate(Motifs):
if kmer[i] == "A":
A +=1
if kmer[i] == "T":
T +=1
if kmer[i] == "C":
C +=1
if kmer[i] == "G":
G+=1
if kmer_number == len(Motifs)-1:
count["A"].append(A)
count["T"].append(T)
count["C"].append(C)
count["G"].append(G)
return count
# In[2]:
##Using the hints/advice given to me by the program, I will design this second
def Count2(Motifs):
#Making an empty dictionary
count = {}
#Creating the dictionary with the appropriate number of zeros in every position for every nucleotide already inputted
#k is th length of the motif. Each motif is the same length so we can just use the first one
#The list iterates through the 4 nucleotides of life, ACGT and creates an empty list [] with the key being the symbol
#Once an empty list is created with the key as the nucleotide symbol, a list of zeros is added
#The number of zeros corresponds to k
k = len(Motifs[0])
for symbol in "ACGT":
count[symbol] = []
for j in range(k):
count[symbol].append(0)
#t is the number of motifs we need t count
t = len(Motifs)
#iterating from 0 the the max number of sequences
for i in range(t):
#For each motif, we will iterate through the length
for j in range(k):
#We are going to read what the nucleotide is at the motif we're looking at --> i and what position --> j
symbol = Motifs[i][j]
#add +1 to the dictionary
count[symbol][j] += 1
return count
# In[3]:
#This is a program that will normalize the nucleotide counts at each position to a decimal frequency
def Profile(Motifs):
#number of motifs we have to count
t = len(Motifs)
#length of each motif..all of them are the same length so we just count the length of the first one
k = len(Motifs[0])
#getting our raw nucleotide counts from the count program
count = Count(Motifs)
#setting up a new dictionary using the technique from count2 where all the positions are 0's
normalized_count = {}
for symbol in "ACGT":
normalized_count[symbol] = []
for j in range(k):
normalized_count[symbol].append(0)
#iterating through all the positions starting from 0
for position in range(k):
total = 0
#count the total number of nucleotide counts at each position
for nt in "ACGT":
total += count[nt][position]
#divide the raw nucleotide total by the total number of all nucleotides
for nt in "ACGT":
normalized_count[nt][position] = float(count[nt][position])/total
return normalized_count
# In[4]:
#This is a program that will give me back a consensus sequence based on count
def Consensus(Motifs):
consensus = ""
k = len(Motifs[0])
count = Count(Motifs)
for position in range(k):
max = 0
max_nt = ''
for nt in "ACGT":
if count[nt][position] > max:
max = count[nt][position]
max_nt = nt
consensus += max_nt
return consensus
# In[5]:
#This is a program that will score the consensus sequence based on how many nucleotides DO NOT match the consensus sequence
def Score(Motifs):
consensus = Consensus(Motifs)
count = Count(Motifs)
k = len(Motifs[0])
score = 0
for position in range(k):
current_nt = consensus[position]
for symbol in "ACGT":
if symbol != current_nt:
score += count[symbol][position]
return score
# In[6]:
#This program will calculate the probability of obtaining a k-mer given the output(probability matrix) from Profile
#See Biology Meets Programming - Motif Counter if confused
def Pr(Text, Profile):
k = len(Text)
probability = 1
matrix = Profile
for position in range(k):
probability*= Profile[Text[position]][position]
return probability
# In[7]:
#This program will find the MOST likely k-mer given the: k-mer size, the profile matrix, and sequence we want to search through
def ProfileMostProbablePattern(text, k, profile):
#Since this sequence is circular, we need to add a bit of the beginning of the sequence to end for long motifs
extendedGenome = text + text[0:len(text)//2]
#just initializing some variables
best_prob = -1
best_kmer = extendedGenome[0:k]
#So first I'll be iterating through each position
for i in range(len(text)-k+1):
#Checks probability of k-mer i to i+k
current_prob = Pr(extendedGenome[i:i+k], profile)
#if probability is better than previous one, replace.
if current_prob > best_prob:
best_prob = current_prob
best_kmer = extendedGenome[i:i+k]
return best_kmer
# In[8]:
#This program will find the best k-mer given: list of sequence strings, size of k-mer, number of sequences to search through
def GreedyMotifSearch(Dna, k, t):
#The best motif is set as the first k-mer of each sequence, so you have a list of t-number of k-mers
BestMotifs = []
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
#Now n is the length of the first sequence I have
n = len(Dna[0])
#i will be the starting position of where I will be drawing my k-mers from
for i in range(n-k+1):
#Empty list creation
Motifs = []
#I add a k-mer of appropriate length from the first sequence string to the list Motifs
Motifs.append(Dna[0][i:i+k])
#iterating from 0 to the number of sequences you have
for j in range(1, t):
#P is equal to the probability matrix generated by profile, which will count how similar nucleotides are and return
#it as a percentage of A's or T's at a position. This will only have up to t-number of k-mers in it
P = Profile(Motifs[0:j])
#Add the most similar k-mer from the following sequences 1,2,3...t to this motifs list
Motifs.append(ProfileMostProbablePattern(Dna[j], k, P))
#Now we will check to see if the score of our generated 5 k-mers is better than the previous one. Thus we end up
#eventually with the 5 best k-mers
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
# In[9]:
#TestSet
Dna = [
'GACCTACGGTTACAACGCAGCAACCGAAGAATATTGGCAA',
'TCATTATCGATAACGATTCGCCGGAGGCCATTGCCGCACA',
'GGAGTCTGGTGAAGTGTGGGTTATGGGGCAGACTGGGAAA',
'GAATCCGATAACTGACACCTGCTCTGGCACCGCTCTCATC',
'AAGCGCGTAGGCGCGGCTTGGCATCTCGGTGTGTGGCCAA',
'AATTGAAAGGCGCATCTTACTCTTTTCGCTTAAAATCAAA',
'GGTATAGCCAGAAAGCGTAGTTAATTTCGGCTCCTGCCAA',
'TCTGTTGTTGCTAACACCGTTAAAGGCGGCGACGGCAACT'
]
print(GreedyMotifSearch(Dna, 4, 8))
#Output:
# CGCA
# CGCA
# GGAG
# GGCA
#GGCA
#CGCA
#GGTA
#GGCA
# In[ ]:
####The code below this is for revising the previous greedy motif method into a better one
# In[ ]:
#This is a revised method of counting, since the original method suffers from Cromwell's problem i.e. assigning something a
#probability of zero is not the best way to create a set... everything should have a nonzero percent chance of happening,
#no matter how unlikely
def CountWithPseudocounts(Motifs):
#Making an empty dictionary
count = {}
#Creating the dictionary with the appropriate number of zeros in every position for every nucleotide already inputted
#k is th length of the motif. Each motif is the same length so we can just use the first one
#The list iterates through the 4 nucleotides of life, ACGT and creates an empty list [] with the key being the symbol
#Once an empty list is created with the key as the nucleotide symbol, a list of zeros is added
#The number of zeros corresponds to k
k = len(Motifs[0])
for symbol in "ACGT":
count[symbol] = []
for j in range(k):
count[symbol].append(1)
#t is the number of motifs we need t count
t = len(Motifs)
#iterating from 0 the the max number of sequences
for i in range(t):
#For each motif, we will iterate through the length
for j in range(k):
#We are going to read what the nucleotide is at the motif we're looking at --> i and what position --> j
symbol = Motifs[i][j]
#add +1 to the dictionary
count[symbol][j] += 1
return count
# In[10]:
#Revised ProfileCount
def ProfileWithPseudocounts(Motifs):
#number of motifs we have to count
t = len(Motifs)
#length of each motif..all of them are the same length so we just count the length of the first one
k = len(Motifs[0])
#getting our raw nucleotide counts from the count program
count = CountWithPseudocounts(Motifs)
#setting up a new dictionary using the technique from count2 where all the positions are 0's
normalized_count = {}
for symbol in "ACGT":
normalized_count[symbol] = []
for j in range(k):
normalized_count[symbol].append(0)
#iterating through all the positions starting from 0
for position in range(k):
total = 0
#count the total number of nucleotide counts at each position
for nt in "ACGT":
total += count[nt][position]
#divide the raw nucleotide total by the total number of all nucleotides
for nt in "ACGT":
normalized_count[nt][position] = float(count[nt][position])/total
return normalized_count
# In[ ]:
#Updated GreedyCount
def GreedyMotifSearchWithPseudocounts(Dna, k, t):
#The best motif is set as the first k-mer of each sequence, so you have a list of t-number of k-mers
BestMotifs = []
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
#Now n is the length of the first sequence I have
n = len(Dna[0])
#i will be the starting position of where I will be drawing my k-mers from
for i in range(n-k+1):
#Empty list creation
Motifs = []
#I add a k-mer of appropriate length from the first sequence string to the list Motifs
Motifs.append(Dna[0][i:i+k])
#iterating from 0 to the number of sequences you have
for j in range(1, t):
#P is equal to the probability matrix generated by profile, which will count how similar nucleotides are and return
#it as a percentage of A's or T's at a position. This will only have up to t-number of k-mers in it
P = ProfileWithPseudocounts(Motifs[0:j])
#Add the most similar k-mer from the following sequences 1,2,3...t to this motifs list
Motifs.append(ProfileMostProbablePattern(Dna[j], k, P))
#Now we will check to see if the score of our generated 5 k-mers is better than the previous one. Thus we end up
#eventually with the 5 best k-mers
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
# In[12]: