-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequence_alignment.py
216 lines (183 loc) · 6.97 KB
/
sequence_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Implementation: Aggelos Pikrakis, [email protected]
# Educational material.
# Distributed under the GNU General Public License v2.0
###################################################################################################
import numpy as np
import matplotlib.pyplot as plt
import other_functions
###################################################################################################
def edit_distance(y, x):
# computes the standard edit (Levenstein) distance between two symbol strings
lx = len(x)
ly = len(y)
s = float('inf') * np.ones((ly, lx))
predi = np.ones((ly, lx)) * (-1)
predj = np.ones((ly, lx)) * (-1)
s[0, 0] = int(y[0] != x[0])
# first column
for i in range(1,ly):
s[i, 0] = s[i-1, 0]+ 1
predi[i, 0] = i - 1
predj[i, 0] = 0
# first row
for j in range(1, lx):
s[0, j] = s[0, j-1] + 1
predi[0, j] = 0
predj[0, j] = j - 1
# all other cost grid elements
for i in range(1,ly):
for j in range(1,lx):
q= [s[i-1, j-1]+int(y[i] != x[j]), s[i-1, j]+1, s[i, j-1] + 1]
s[i, j] = np.amin(q)
if np.argmin(q)==0:
predi[i, j] = i - 1
predj[i, j] = j - 1
elif np.argmin(q)==1:
predi[i, j] = i - 1
predj[i, j] = j
else:
predi[i, j] = i
predj[i, j] = j - 1
return s[ly-1, lx-1], s, predi, predj
###################################################################################################
def dtw_distance_sc(y, x, gp_constraints):
# computes the dynamic time warping cost between two sequences of numbers using the Sakoe-Chiba
# local path constraints
lx = len(x)
ly = len(y)
s = float('inf') * np.ones((ly, lx))
predi = np.ones((ly, lx)) * (-1)
predj = np.ones((ly, lx)) * (-1)
s[0, 0] = abs(y[0] - x[0])
# first column
for i in range(1,ly):
if abs(i-0)<=gp_constraints:
s[i, 0] = s[i-1, 0] + abs(y[i] - x[0])
predi[i, 0] = i-1
predj[i, 0] = 0
# first row
for j in range(1, lx):
if abs(0 - j) <= gp_constraints:
s[0, j] = s[0, j-1] + abs(y[0] - x[j])
predi[0, j] = 0
predj[0, j] = j-1
# all other cost grid elements
for i in range(1,ly):
for j in range(1,lx):
if abs(i - j) <= gp_constraints:
q = [s[i-1, j-1], s[i-1, j], s[i, j-1]]
s[i, j] = np.amin(q) + abs(y[i] - x[j])
if np.argmin(q)==0:
predi[i, j] = i - 1
predj[i, j] = j - 1
elif np.argmin(q)==1:
predi[i, j] = i - 1
predj[i, j] = j
else:
predi[i, j] = i
predj[i, j] = j - 1
return s[ly-1, lx-1], s, predi, predj
###################################################################################################
def dtw_distance_ita(y, x, gp_constraints):
# computes the dynamic time warping cost between two sequences of numbers using the Itakura
# local path constraints
lx = len(x)
ly = len(y)
s = float('inf')*np.ones((ly,lx))
predi = np.ones((ly,lx))*(-1)
predj = np.ones((ly, lx))*(-1)
s[0, 0] = abs(y[0] - x[0])
# first row
if abs(0-1)<=gp_constraints:
s[0, 1] = s[0, 0] + abs(y[0]-x[1])
predi[0, 1] = 0
predj[0, 1] = 0
# all other cost grid elements
for i in range(1,ly):
for j in range(1,lx):
if abs(i-j) <= gp_constraints:
offset = 0
if i > 1:
if j > 1 and predi[i, j-1] == i:
q = [s[i - 1, j - 1], s[i - 2, j - 1]]
offset = 1
else:
q = [s[i, j-1], s[i-1, j-1], s[i-2, j-1]]
else:
if j > 1 and predi[i, j-1] == i:
q = [s[i - 1, j - 1]]
offset = 1
else:
q = [s[i, j - 1], s[i - 1, j - 1]]
s[i, j] = np.amin(q) + abs(y[i] - x[j])
predi[i, j] = i - offset - np.argmin(q)
predj[i, j] = j - 1
return s[ly-1, lx-1], s, predi, predj
###################################################################################################
def lcs(y, x):
# computes the length of the longest common subsequence of y and x
lx = len(x)
ly = len(y)
frwrdi = np.ones((ly, lx)) * (-1)
frwrdj = np.ones((ly, lx)) * (-1)
s = np.zeros((ly, lx))
# last column
for i in range(0, ly):
s[i, lx-1] = int(y[i] == x[lx-1])
# last row
for j in range(0, lx):
s[ly-1, j] = int(y[ly-1] == x[j])
for i in range(ly-2, -1, -1):
for j in range(lx-2, -1, -1):
m1 = np.amax(s[i + 1,j + 1:lx])
m2 = np.amax(s[i + 1:ly, j + 1])
if np.amax([m1, m2]) == m1:
w = np.argmax(s[i + 1, j + 1:lx]) + 1
frwrdi[i, j] = i + 1
frwrdj[i, j] = j + w
else:
w = np.argmax(s[i + 1:ly, j + 1]) + 1
frwrdi[i, j] = i + w
frwrdj[i, j] = j+1
s[i, j] = int(y[i] == x[j]) + np.amax([m1,m2])
return np.amax(s), s, frwrdi, frwrdj
###################################################################################################
def backtracking(s, predi, predj):
# performs backtracking on a dissimilarity grid, starting from the node that corresponds to the last element of
# each sequence
v = s.shape
xc = int(v[0]) - 1
yc = int(v[1]) - 1
bp=[]
while (xc > -1) and (yc > -1):
bp.append([xc, yc])
tmpxc = int(predi[xc, yc])
tmpyc = int(predj[xc, yc])
xc = tmpxc
yc = tmpyc
return bp[::-1]
###################################################################################################
def forwardtracking(s, frwrdi, frwrdj):
# performs forward tracking on a similarity grid, starting from the node that has accumulated the maximun similarity
# value. This function can be used in cojunction with the lcs implementation
xc, yc = other_functions.aux_fun(s, np.max)
bp = []
while (xc > -1) and (yc > -1):
bp.append([xc, yc])
tmpxc = int(frwrdi[xc, yc])
tmpyc = int(frwrdj[xc, yc])
xc = tmpxc
yc = tmpyc
return bp
def print_alignment(y, x, bp):
# prints the alignment of y vs x, inserting gaps when needed
L = len(bp)
for k in range(L-1):
print('%3d <----> %3d' % (y[bp[k][0]],x[bp[k][1]]))
if bp[k+1][0] > bp[k][0]+1:
for m in range(bp[k][0]+1, bp[k+1][0]):
print('%3d <----> %3s'% (y[m], '-'))
if bp[k+1][1] > bp[k][1]+1:
for m in range(bp[k][1]+1, bp[k+1][1]):
print('%3s <----> %3d' % ('-', x[m]))
print('%3d <----> %3d' % (y[bp[-1][0]], x[bp[-1][1]]))