-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathBIN-ALI-Similarity.R
313 lines (251 loc) · 10.7 KB
/
BIN-ALI-Similarity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# tocID <- "BIN-ALI-Similarity.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe ([email protected])
#
# Versions:
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 Refactored for 2017; add aaindex, ternary plot.
# 0.1 First code copied from 2016 material.
#
#
# TODO:
# Update ggtern:: ternary plot to use aacol dots under text
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 Amino Acid Properties 43
#TOC> 2 Mutation Data matrix 189
#TOC> 3 Background score 230
#TOC>
#TOC> ==========================================================================
# = 1 Amino Acid Properties ===============================================
# A large collection of amino acid property tables is available via the seqinr
# package:
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
# data:
?aaindex
data(aaindex, package = "seqinr") # load the aaindex list from the package
length(aaindex)
# Here are all the index descriptions
for (i in 1:length(aaindex)) {
cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
}
# It's a bit cumbersome to search through the descriptions ... here is a
# function to make this easier:
searchAAindex <- function(patt) {
# Searches the aaindex descriptions for regular expression "patt"
# and prints index number and description.
hits <- which(sapply(aaindex, function(x) length(grep(patt, x$D)) > 0))
for (i in seq_along(hits)) {
cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
}
}
searchAAindex("free energy") # Search for "free energy"
searchAAindex("(size)|(volume)") # Search for "size" or "volume":
# Let's examine ...
# ... a hydrophobicity index
(Y <- aaindex[[528]][c("D", "I")])
# ... a volume index
(V <- aaindex[[150]][c("D", "I")])
# ... and one of our own: side-chain pK values as reported by
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
# to 7.4 (physiological pH)
K <- list(I = c( 7.4, # Ala
12.3, # Arg
7.4, # Asn
3.9, # Asp
8.6, # Cys
7.4, # Gln
4.3, # Glu
7.4, # Gly
6.5, # His
7.4, # Ile
7.4, # Leu
10.4, # Lys
7.4, # Met
7.4, # Phe
7.4, # Pro
7.4, # Ser
7.4, # Thr
7.4, # Trp
9.8, # Tyr
7.4)) # Val
names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
"Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
# Given these biophysical indices, how similar are the amino acids? We have three-dimensions of measures here. Scatterplots can only display two dimensions ...
# pull the names from Y$I, convert them to single letter code, and reorder the
# AACOLS palette accordingly ...
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
plot(Y$I, V$I,
xlab = "hydrophobicity", ylab = "volume",
pch = 21,
cex = 6,
col = aac,
bg = aac)
text(Y$I, V$I, names(Y$I), cex = 0.8)
plot(Y$I, K$I,
xlab = "hydrophobicity", ylab = "pK",
pch = 21,
cex = 6,
col = aac,
bg = aac)
text(Y$I, K$I, names(Y$I), cex = 0.8)
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
# plots are in general unintuitive and hard to interpret. One alternative is a
# so-called "ternary plot":
if (! requireNamespace("ggtern", quietly=TRUE)) {
install.packages("ggtern")
}
# Package information:
# library(help = ggtern) # basic information
# browseVignettes("ggtern") # available vignettes
# data(package = "ggtern") # available datasets
# collect into data frame, normalize to (0.05, 0.95)
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
"vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
"pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
stringsAsFactors = FALSE)
rownames(myDat) <- names(Y$I)
ggtern::ggtern(data = myDat,
ggplot2::aes(x = vol,
y = phi,
z = pK,
label = rownames(myDat))) + ggplot2::geom_text()
# This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes.
# ... or we could use principal components analysis, to pull out the
# best projection of the three feature dimensions into two. (Done here without delving
# into the theory ...)
prc <- prcomp(myDat)
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
pch=19, cex=6, col=aad, cex.main=0.7,
main="Principal Component Analysis of Amino Acid Features")
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
# This matches the intuition rather well in that "similar" amino acids are close
# on the plot. But we can't interpret the distances in terms of just one of the
# parameters. Whatever - nature has a different way to define similarity:
# mutations to similar amino acids are less likely to break the protein.
# = 2 Mutation Data matrix ================================================
# A mutation data matrix encodes all amino acid pairscores in a matrix.
# The Biostrings package contains the most common mutation data matrices.
if (! requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help=Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Let's attach the BLOSUM62 mutation data matrix from the package
data(BLOSUM62, package = "Biostrings")
# ... and see what it contains. (You've seen this matrix before.)
BLOSUM62
# We can simply access values via the row/column names.
# Identical amino acids have high scores ...
BLOSUM62["H", "H"] # Score for a pair of two histidines
BLOSUM62["S", "S"] # Score for a pair of two serines
# Similar amino acids have low positive scores ...
BLOSUM62["L", "I"] # Score for a leucine / lysine pair
BLOSUM62["F", "Y"] # etc.
# Dissimilar amino acids have negative scores ...
BLOSUM62["L", "K"] # Score for a leucine / lysine pair
BLOSUM62["Q", "P"] # etc.
BLOSUM62["R", "W"] # the matrix is symmetric!
BLOSUM62["W", "R"]
# = 3 Background score ====================================================
# The mutation data matrix is designed to give high scores to homologous
# sequences, low scores to non-homologous sequences. What score on average
# should we expect for a random sequence?
# If we sample amino acid pairs at random, we will get a score that is the
# average of the individual pairscores in the matrix. Omitting the ambiguity
# codes and the gap character:
sum(BLOSUM62[1:20, 1:20])/400
# But that score could be higher for real sequences, for which the amino acid
# distribution is not random. For example membrane proteins have a large number
# of hydrophobic residues - an alignment of unrelated proteins might produce
# positive scores. And there are other proteins with biased amino acid
# compositions, in particular poteins that interact with multiple other
# proteins. Let's test how this impacts the background score by comparing a
# sequence with shuffled sequences. These have the same composition, but are
# obvioulsy not homologous. The data directory contains the FASTA file for the
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
# low-complexity amino acid sequence ...
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
# with an exceptionally high percentage of hydrophobic residues.
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
# Here is a function that takes two sequences and
# returns their average pairscore.
averagePairScore <- function(a, b, MDM = BLOSUM62) {
# Returns average pairscore of two sequences.
# Parameters:
# a, b chr amino acid sequence string
# MDM mutation data matrix. Default is BLOSUM62
# Value: num average pairscore.
a <- unlist(strsplit(a, ""))
b <- unlist(strsplit(b, ""))
v <- 0
for (i in seq_along(a)) {
v <- v + MDM[ a[i], b[i] ]
}
return(v / length(a))
}
orig3FG7 <- toString(aa3FG7)
orig2F1C <- toString(aa2F1C)
N <- 1000
scores3FG7 <- numeric(N)
scores2F1C <- numeric(N)
for (i in 1:N) {
scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
}
# Plot the distributions
hist(scores3FG7,
col="#5599EE33",
breaks = seq(-1.5, 0, by=0.1),
main = "Pairscores for randomly shuffled sequences",
xlab = "Average pairscore from BLOSUM 62")
hist(scores2F1C,
col="#55EE9933",
breaks = seq(-1.5, 0, by=0.1),
add = TRUE)
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
legend('topright',
c("3FG7 (villin)", "2F1C (OmpG)"),
fill = c("#5599EE33", "#55EE9933"), bty = 'n',
inset = 0.1)
# This is an important result: even though we have shuffled significantly biased
# sequences, and the average scores trend above the average of the mutation data
# matrix, the average scores still remain comfortably below zero. This means
# that we can't (in general) improve a high-scoring alignment by simply
# extending it with randomly matched residues. We will only improve the score if
# the similarity of newly added residues is larger than what we expect to get by
# random chance!
# [END]