-
Notifications
You must be signed in to change notification settings - Fork 3
/
align.py
executable file
·131 lines (118 loc) · 3.87 KB
/
align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Some inspiration from Forrest Bao at http://narnia.cs.ttu.edu/drupal/node/104,
# as such, this code is released under the GNU GPL version 3, although the code
# is entirely my own.
#
# copyright Alec Story, 2011
import numpy
import os
import pickle
MATCH = 2
MISMATCH = -1
GAP = "|"
ZERO_POINT = 0
MATCH_POINT = 1
DEL_POINT = 2
INS_POINT = 3
cache = {}
def load_cache(filename="align_cache.pickle"):
print "loading cache...",
path = os.path.relpath(filename)
f = open(path, 'r')
cache.update(pickle.load(f))
f.close()
print "done."
def save_cache(filename="align_cache.pickle"):
print "saving cache...",
path = os.path.relpath(filename)
f = open(path, 'w')
pickle.dump(cache, f)
f.close()
print "done."
def weight(a_i, b_j):
if a_i == b_j:
return MATCH
else:
return MISMATCH
def abbreviate(a,b):
a = a[0:20]
b = b[0:20]
return (a,b)
def sw_align(a, b):
"""Smith-Waterman alignment
Source: http://en.wikipedia.org/wiki/Smith_Waterman
Aligns a against b
returns: (score, aligned-a, aligned-b, (a start index, b start index), (a
end index, b end index))
"""
a = GAP + a
b = GAP + b
if abbreviate(a,b) in cache:
pass
else:
# Definitions:
# a,b are strings
# strings start with GAP to keep indices the same as algorithm given
# if a_i == b_j then w(a_i, b_j) = w(match)
# if a_i != b_j then w(a_i, b_j) = w(mismatch)
# H(i,j) is the maximum Similarity-Score between a suffix of a[1..i] and
# a suffix of b[1..j]
# w(c,d), c,d in alphabet and '-', '-' is the gap-scoring scheme
# H(i,0) = 0, 0 <= i <= m
m = len(a)
n = len(b)
H = numpy.zeros((m,n))
pointers = numpy.zeros((m,n))
for i in range(1,m):
for j in range(1,n):
s_match = H[i-1,j-1] + weight(a[i],b[j])
s_del = H[i-1, j] + weight(a[i],GAP)
s_ins = H[i, j-1] + weight(GAP,b[j])
H[i,j] = max(
0,
s_match,
s_del,
s_ins
)
# build traceback
if H[i,j] == 0:
pointers[i,j] = ZERO_POINT
elif H[i,j] == s_match:
pointers[i,j] = MATCH_POINT
elif H[i,j] == s_del:
pointers[i,j] = DEL_POINT
else:
pointers[i,j] = INS_POINT
# To obtain the optimum local alignment, we start with the highest value
# in the matrix (i,j).
best = (0,0) # value is always 0
for i in range(1,m):
for j in range(1,n):
if H[i,j] > H[best]:
best = (i,j)
# Then, we go backwards to one of positions (i-1,j), (i,j-1), and
# (i-1,j-1) depending on the direction of movement used to construct the
# matrix. We keep the process until we reach a matrix cell with zero
# value, or the value in position (0,0).
point = best
align_a,align_b = "", ""
while pointers[point] != ZERO_POINT:
i,j = point
if pointers[point] == MATCH_POINT:
align_a = a[i] + align_a
align_b = b[j] + align_b
point = (i-1, j-1)
elif pointers[point] == DEL_POINT:
align_a = a[i] + align_a
align_b = GAP + align_b
point = (i-1, j)
else: #INS_POINT
align_a = GAP + align_a
align_b = b[j] + align_b
point = (i, j-1)
cache[abbreviate(a,b)] = H[best], align_a, align_b, point, best
return cache[abbreviate(a,b)]
if __name__ == "__main__":
load_cache()
print sw_align("albacore", "bacon")
print sw_align("albacore", "baecon")
save_cache()