-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathBIN-PHYLO-Tree_analysis.R
407 lines (320 loc) · 15.3 KB
/
BIN-PHYLO-Tree_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# tocID <- "BIN-PHYLO-Tree_analysis.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
#
# Version: 2.0
#
# Date: 2017 - 10 - 2022 - 11
# Author: Boris Steipe ([email protected])
#
# Versions:
# 2.0 2022 The PHYLIP era is over
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
# Rewrite of tip re-ordering. Better handling of
# messages. pBar() for randomization.
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0.2 Typo in variable name, style changes
# 1.0.1 Wrong section heading
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 52
#TOC> 2 SPECIES REFERENCE TREE 68
#TOC> 3 Tree Analysis 124
#TOC> 3.1 Visualizing your tree 126
#TOC> 3.2 Rooting Trees 185
#TOC> 3.3 Rotating Clades 220
#TOC> 3.4 Computing tree distances 308
#TOC>
#TOC> ==========================================================================
# = 1 Preparation and Tree Plot ===========================================
if (! requireNamespace("ape", quietly = TRUE)) {
install.packages("ape")
}
# Package information:
# library(help = ape) # basic information
# browseVignettes("ape") # available vignettes
# data(package = "ape") # available datasets
# We change the graphics parameters from time to time, let's define the
# default so we can recreate a sane state:
dev.off()
PAR <- par()
# = 2 SPECIES REFERENCE TREE ==============================================
# Before we do any kind of phylogenetic analysis of genes from several species,
# we MUST have a reference tree of the taxonomic relationships in hand. This
# context is absolutely required for the interpretation of our tree.
# We have the tax-ids in our database, and the NCBI has the species tree - we
# just need some way to extract the subtree that corresponds to our taxons of
# interest. Here's how to use the taxize:: package.
if (! requireNamespace("taxize", quietly = TRUE)) {
install.packages("taxize")
}
# Package information:
# library(help = taxize) # basic information
# browseVignettes("taxize") # available vignettes
# data(package = "taxize") # available datasets
( mySOI <- c(myDB$taxonomy$ID, "83333") )
myClass <- taxize::classification(mySOI, db = "ncbi")
# Note: You will get a warning about not providing an authentication
# key - but the data object should have been computed anyway. For this
# lightweight task, you do not need a key.
str(myClass)
myClass[[1]]
fungiTree <- taxize::class2tree(myClass, check = TRUE)
plot(fungiTree)
# The tree produced by taxize:: contains full length species names,
# but it would be more convenient if it had bicodes instead. Also, the actual
# tree is only part of the list(), which will cause problems later:
str(fungiTree)
# ... we therefore simplify the object
fungiTree <- fungiTree$phylo
str(fungiTree)
# The species names are in a vector $phylo$tip.label of this list.
# We can use biCode() to shorten them.
fungiTree$tip.label <- biCode(fungiTree$tip.label)
# Plot the tree
nSP <- length(fungiTree$tip.label)
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.6,
adj = 0.2,
bg = "#D4F2DA")
# Note that you can use the arrow buttons in the menu above the plot pane to
# scroll back to plots you have created earlier - so you can reference back to
# this species tree in your later analysis.
# = 3 Tree Analysis =======================================================
# == 3.1 Visualizing your tree =============================================
# The trees that are produced by Rphylip are stored as an object of class
# "phylo". This is a class for phylogenetic trees that is widely used in the
# community, practically all R phylogenetics packages will options to read and
# manipulate such trees. Outside of R, a popular interchange format is the
# Newick_format that you have seen above. It's easy to output your calculated
# trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and
# print it have been defined in ape::. You can simply call plot(<your-tree>) and
# R knows what to do with <your-tree> and how to plot it. The underlying
# function is plot.phylo(), and documentation for its many options can by found
# by typing:
?plot.phylo
# We load the APSES sequence tree that you produced in the
# BIN-PHYLO-Tree_building unit:
apsTree <- ape::read.tree("data/apsesphyloset_phy_phyml_tree.txt")
plot(apsTree) # default type is "phylogram"
plot(apsTree, type = "unrooted")
plot(apsTree, type = "fan", no.margin = TRUE)
# Rescale to show all of the labels:
# record the current plot parameters by assigning them to a variable ...
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
# ... and adjust the plot limits for a new plot:
plot(apsTree,
type = "fan",
x.lim = tmp$x.lim * 1.8,
y.lim = tmp$y.lim * 1.8,
cex = 0.8,
no.margin = TRUE)
# Inspect the tree object
str(apsTree)
apsTree$tip.label
apsTree$edge
apsTree$edge.length
# show the node / edge and tip labels on a plot
plot(apsTree)
ape::nodelabels()
ape::edgelabels()
ape::tiplabels()
# show the number of nodes, edges and tips
ape::Nnode(apsTree)
ape::Nedge(apsTree)
ape::Ntip(apsTree)
par(PAR) # reset graphics state
# Finally, write the tree to console in Newick format
ape::write.tree(apsTree)
# == 3.2 Rooting Trees =====================================================
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. By default, PhyML returns an unrooted tree.
ape::is.rooted(apsTree)
# You can root the tree with the command root() from the "ape" package.
plot(apsTree)
# add labels for internal nodes and tips
ape::nodelabels(cex = 0.5, frame = "circle")
ape::tiplabels(cex = 0.5, frame = "rect")
# The outgroup of the tree (KILA ESCCO) is tip "6" in my sample tree, it may be
# a different number in yours. Substitute the correct node number below for
# "outgroup".
apsTree <- ape::root(apsTree, outgroup = 6, resolve.root = TRUE)
plot(apsTree, cex = 0.7)
ape::is.rooted(apsTree)
# Let's add a label for the fungi MRCA
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
# This procedure does however not assign an actual length to a root edge, and
# therefore no root edge is visible on the plot. Why? you might ask. I ask
# myself that too. We'll just add a length by hand.
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
plot(apsTree, cex = 0.7, root.edge = TRUE)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
# == 3.3 Rotating Clades ===================================================
# To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species.
# We can either rotate around individual internal nodes ...
layout(matrix(1:2, 1, 2))
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
# Note that the species at the bottom of the clade descending from node
# 17 is now plotted at the top.
par(PAR) # reset graphics state
# ... or we can rearrange the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
# from the NCBI taxonomic classification.
nOrg <- length(apsTree$tip.label)
plot(fungiTree,
no.margin = FALSE, root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
# These are the fungi tree tips ...
fungiTree$tip.label
# ... and their order is determined by the edge-list that is stored in
fungiTree$edge
# which edges join the tips?
ape::tiplabels(cex = 0.5, frame = "rect")
# As you can see, the tips (range [1:nOrg] ) are in column 2 and they are
# ordered from bottom to top. And each tip number is the index of the species in
# the tip.label vector. So we can take column 2, subset it, and use it to get a
# list of species in the order of the tree ...
sel <- fungiTree$edge[ , 2 ] <= nOrg
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
# Now, here are the genes of the apsTree tips ...
apsTree$tip.label
# ... and the "constraint" we need for reordering, according to the help page
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
oSp <- gsub("^", "MBP1_", oSp)
( oSp <- gsub("MBP1_ESSCO", "KILA_ESCCO", oSp) )
# Then we can plot the two trees to compare: the fungi- tree
par(PAR) # reset graphics state
layout(matrix(1:2, 1, 2))
plot(fungiTree,
no.margin = TRUE,
root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
# and the re-organized apsesTree ...
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
no.margin = TRUE,
root.edge = TRUE)
par(PAR) # reset graphics state
# As you can see, the reordering is not perfect, since the topologies are
# different, mostly due to the unresolved nodes in the reference tree. One
# could play with that ...
# Task: Study the two trees and consider their similarities and differences.
# What do you expect? What do you find? Note that this is not a "mixed"
# gene tree yet, since it contains only a single gene for the species
# we considered. All of the branch points in this tree are speciation
# events. Thus the gene tree should have the same topology as the
# species tree. Does it? Are the differences important? How many
# branches would you need to remove and reinsert elsewhere to get the
# same topology as the species tree?
# In order to quantify how different these two trees are, we need to compute
# tree distances.
# == 3.4 Computing tree distances ==========================================
# Many superb phylogeny tools are contributed by the phangorn:: package.
if (! requireNamespace("phangorn", quietly = TRUE)) {
install.packages("phangorn")
}
# Package information:
# library(help = phangorn) # basic information
# browseVignettes("phangorn") # available vignettes
# data(package = "phangorn") # available datasets
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
# "KILA_" from the existing tip labels in a copy of our APSES domain tree. (Note
# that this can only work if both the number and labels of taxa are identical in
# both trees.)
apsTree2 <- apsTree
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
# phangorn:: provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference"
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
# Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be?
# The ape package provides the function rtree()
# to compute random trees.
ape::rtree(n = length(apsTree2$tip.label), # number of tips
rooted = TRUE, # we rooted the tree above,
# and fungiTree is rooted anyway
tip.label = apsTree2$tip.label, # use the apsTree2 labels
br = NULL) # don't generate branch lengths since
# fungiTree has none, so we can't
# compare them anyway.
# (Note the warning message about non-binary trees; we'll suppress that later
# by wrapping the function call in supressMessages(); we don't want to
# print it 10,000 times :-)
# Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree().
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
# defined in .utilities.R to keep track of where we are at:
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358)
for (i in 1:N) {
pBar(i, N)
xTree <- ape::rtree(n = length(apsTree2$tip.label),
rooted = TRUE,
tip.label = apsTree2$tip.label,
br = NULL)
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
}
set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"])
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
par(PAR) # reset graphics state
hist(myTreeDistances[, "path"],
col = "aliceblue",
main = "Distances of random Trees to fungiTree")
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
abline(v = pathObs, col = "chartreuse")
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
# Indeed, our apsTree is _very_ much more similar to the species tree than
# we would expect by random chance.
# What do we gain from that analysis? Analyzing the tree we get from a single
# gene of orthologous sequences is a positive control in our computational
# experiment. If these genes are indeed orthologues, a correct tree-building
# program ought to give us a tree that exactly matches the species tree.
# Evaluating how far off we are from the known correct result gives us a way to
# validate our workflow and our algorithm. If we can't get that right, we can't
# expect to get "real" data right either. Employing such positive controls in
# every computational experiment is essential for research. Not doing so is
# Cargo Cult Bioinformatics.
# [END]