-
Notifications
You must be signed in to change notification settings - Fork 1
/
Node Class - Exercise 6b.py
492 lines (436 loc) · 24.2 KB
/
Node Class - Exercise 6b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
"""
Exercise 6 - Creating and Using Node and Tree Classes
@author: jembrown
Below is the beginning of a Node class definition and a simple example of how
to link nodes to form a tree. Use this as a springboard to start thinking about:
- What other attributes of a Node might we like to store?
- How do we define a Tree class? What attributes should it have?
- Can you write a function to print out a parenthetical tree string
(e.g., ((spA,spB),spC)) if the only argument passed to the function is a
root node? This will require recursion.
"""
# ---> Defining Node and Tree classes <---
class Node:
def __init__(self,name="",parent=None,children=None):
self.name = name
self.parent = None
if children is None:
self.children = []
else:
self.children = children
# ---> Creating and linking nodes <---
# Creating nodes to build this simple three-taxon tree: ((spA,spB),spC)
# spA spB spC
# \ / /
# \ / /
# \/ /
# \ /
# \ /
# \/
# |
# Define the root node to start. It currently has no parents or children.
root = Node("root")
# Define a node for species C. It is a direct descendant of the root.
spC = Node("SpeciesC",parent=root)
root.children.append(spC) # Adds spC as a child of the root
# Define a node for the ancestor of species A and B, descending from the root.
ancAB = Node("ancAB",parent=root)
root.children.append(ancAB)
spA = Node("SpeciesA",parent=ancAB) # Creates spA with ancAB as its parent.
spB = Node("SpeciesB",parent=ancAB) # Creates spB with ancAB as its parent.
ancAB.children.append(spA)
ancAB.children.append(spB)
print("ancAB's children: ")
for child in ancAB.children:
print child.name
print("")
print("root's children: ")
for child in root.children:
print child.name
# Play around with nodes and see if you can build more complicated trees!
###Playing with the Golden-green Woodpecker tree - (P. leucolaemus(P. flavigula(P. aurulentus,P. chrysochloros)))
root = Node("root")
leucolaemus = Node ("leucolaemus",parent=root)
root.children.append(leucolaemus)
ancflav_aur_chry=Node("ancflav_aur_chry",parent=root)
root.children.append(ancflav_aur_chry)
flavigula=Node("flavigula",parent=ancflav_aur_chry)
ancflav_aur_chry.children.append(flavigula)
ancaur_chry=Node("ancaur_chry",parent=ancflav_aur_chry)
ancflav_aur_chry.children.append(ancaur_chry)
aurulentus=Node("aurulentus",parent=ancaur_chry)
ancaur_chry.children.append(aurulentus)
chrysochloros=Node("chrysochloros",parent=ancaur_chry)
ancaur_chry.children.append(chrysochloros)
print("ancaur_chry children: ")
for child in ancaur_chry.children:
print child.name
print("root's children: ")
for child in root.children:
print child.name
print("ancflav_aur_chry children: ")
for child in ancflav_aur_chry.children:
print child.name
# Eventually, we will want to create a Tree class, where a parenthetical tree
# string is passed as an argument to the constructor and it automatically creates
# all the nodes and links them together. Start thinking about how to do that.
# Let's go ahead and define a Tree object that houses all these nodes and
# organizes methods associated with them.
import scipy
from numpy.random import uniform
from math import log
from math import exp
from itertools import tee, islice, chain, izip
import operator
import functools
from scipy import linalg
import numpy as np
import re
##Using the continous time Markov Chain Simulator to generate
class ContMarkSim(object):
def __init__ (self,NodeChain=1,v=10,Q=[[-1.916,0.541,0.787,0.588],[0.148,-1.069,0.417,0.506],[0.286,0.170,-0.591,0.135],[0.525,0.236,0.594,-1.355]],waittimes=[],list4=[], states=[],TotalProbs=[],MargProb=[],Likelihood=[]):
self.NodeChain=NodeChain
self.v=v##Ending Time, in this case I chose 10
self.Q=Q##Q matrix, or Rates matrix
##Tuple with my state space
self.waittimes=[]##This will be the list with the waiting times
self.list4=[]##This will be the list with the states (in numbers where 0=A,1=C,2=G and 3=T)
self.states=states##This will give you the states as nucleotides.
self.TotalProbs=TotalProbs
self.MargProb=MargProb
self.Likelihood=Likelihood
##Defining the function that will make my simulation
def MarkSim(self):
##defining the function of discrete sample, to sort the events to change
def discSamp(events,probs):
ranNum = scipy.random.random()
cumulProbs = []
cumulProbs.extend([probs[0]])
for t in range(1,len(probs)):
cumulProbs.extend([probs[t]+cumulProbs[-1]])
for t in range(0,len(probs)):
if ranNum < cumulProbs[t]:
return events[t]
return None
A = np.squeeze(np.asarray(self.Q))##this transforms my Q matrix in an array A
StationaryProbs=linalg.expm(A*1000)##getting the matrix with stationary probability.
list6=[]
list7=[]
list10=[]
E=0##Defining the starting point in the branch
i=self.NodeChain
self.list4=[i]
M4=StationaryProbs[1][i]
##this for loop will calculate the transition matrix for each state from the Q matrix
for d in range(len(self.Q)):
list11=[]
for g in self.Q[d]:
if g > 0:
p=-(g/(self.Q[d][d]))
else:
p = 0
list11.append(p)
list10.append(list11)
T= list10##this is the transition matrix
##this fucntion will be used to set two lists, one with the current sampled states, and another with the respective next states. This will be used to calculate the probabilities of each state change.
def previous_and_next(some_iterable):
prevs, items, nexts = tee(some_iterable, 3)
prevs = chain([None], prevs)
nexts = chain(islice(nexts, 1, 0), [0])
return izip(prevs, items, nexts)
while E < self.v:##this loop will occur while the sum of the waiting times does not reach the ending time
list2=[]
u1=uniform(low=0.0, high=1.0, size=None)##sorting an uniform value to give to the next equation
Wtime=-(1/-(self.Q[i][i]))*log(u1)##calculating the waiting time, by assuming that lambda is equal to the -diagonal (-Qii) value
self.waittimes.append(Wtime)##putting the waiting times in list 1
E=sum(z for z in self.waittimes)##adding the waiting times to make the simulation stop when reaching the ending time
for g in self.Q[i]:
if g > 0:
p=-(g/(self.Q[i][i]))##calculating the probabilities of the next events according to the current state. We divide the positive rates of the row, by the diagonal value
else:
p = 0
list2.append(p)##puting the probabilities in a list
for s in self.waittimes:
exponPDF = -(self.Q[i][i]) * exp(- (-(self.Q[i][i]))*s)##this function calculates the exponential probability density function for each waiting time according to the current state
list6.append(exponPDF)##this list keeps exponPDF values
M1=functools.reduce(operator.mul, list6, 1)## this list has the multiplication of all exponPDF values
i=discSamp(events=[0,1,2,3],probs=list2)##sampling the next state according to the current state
self.list4.append(i)##appending the states in a list
list67=[]
list68=[]
for previous, item, nxt in previous_and_next(self.list4):
list67.append(item)##this list will keep the previous sampled nucleotide
list68.append(nxt)##this list will keep the subsequent nucleotide
for c,m in zip (list67,list68):##this loop will give the right values to transition matrix values
if m != list68[-1]:
statesprobs=T[c][m]##this will give the probabilities of going from the previous state to the next
list7.append(statesprobs)##this will append the probabilities of each state in list7
else:
pass
M2=functools.reduce(operator.mul, list7, 1)#this will multiply the probabilities of sampling each state
MargProbs=linalg.expm(A*self.v)##this calculates the marginal probabilities
finaltime=self.waittimes[-1]
finalstate=self.list4[-1]
cdffinaltime=1-(2.71826**(-(self.Q[finalstate][finalstate])*finaltime))##calculating the probability of last waiting time without a particular change in it.
M3=1-cdffinaltime
self.MargProb=MargProbs[self.list4[0]][self.list4[-1]]
self.TotalProbs=M4*M1*M2*M3##this multiplies the probabilities for each waiting time and for the each nucleotide, in other words, the probability of all the events that occurred in my simulation
self.states = [str(h) for h in self.list4]
self.states = [z.replace('0', 'A') for z in self.states]## Converting the numbers to letters representing nucleotides
self.states = [a.replace('1', 'C') for a in self.states]## Converting the numbers to letters representing nucleotides
self.states = [u.replace('2', 'G') for u in self.states]## Converting the numbers to letters representing nucleotides
self.states = [n.replace('3', 'T') for n in self.states]## Converting the numbers to letters representing nucleotides
return self.states,self.waittimes,self.TotalProbs,self.MargProb##returning the states(substitutions) and waiting times until reaching an ending time
def discSamp(events,probs):
ranNum = scipy.random.random()
cumulProbs = []
cumulProbs.extend([probs[0]])
for t in range(1,len(probs)):
cumulProbs.extend([probs[t]+cumulProbs[-1]])
for t in range(0,len(probs)):
if ranNum < cumulProbs[t]:
return events[t]
return None
# We'll need this later, I promise. It's always better to put import statements
# outside of class definitions. In fact, it's best to put them all at the top
# of a file. This imports the ctmc class that we previously defined.
class MyTree:
"""
Defines a class of phylogenetic tree, consisting of linked Node objects.
"""
def __init__(self):
"""
The constructor really needs to be more flexible, but for now we're
going to define the whole tree structure by hand. This just uses
the same statements we used above. By next Thurs (3/19), see if you can
write a constructor that takes a parenthetical tree as its argument and
builds the corresponding tree in memory.
"""
self.root = Node("root")
self.spC = Node("SpeciesC",parent=self.root)
self.root.children.append(self.spC)
self.ancAB = Node("ancAB",parent=self.root)
self.root.children.append(self.ancAB)
self.spA = Node("SpeciesA",parent=self.ancAB)
self.spB = Node("SpeciesB",parent=self.ancAB)
self.ancAB.children.append(self.spA)
self.ancAB.children.append(self.spB)
# Now, let's add branch lengths to our Node objects (remember, these fields
# can be added arbitrarily in Python). In the future, we should probably include
# branch lengths in the Node constructor.
self.spA.brl = 0.1
self.spB.brl = 0.1
self.spC.brl = 0.2
self.ancAB.brl = 0.1
self.root.brl = 0
# We're also going to add lists to each node that will hold simulated
# sequences.
self.spA.seq = []
self.spB.seq = []
self.spC.seq = []
self.ancAB.seq = []
self.root.seq = []
# Write a recursive function that takes the root node as its only argument and
# prints out all the names of the terminal nodes in the tree. Due next Tues (3/17).
def printNames(self,node):
"""
A method of a Tree object that will print out the names of its
terminal nodes.
"""
if len(node.children) > 0:#if the node has children, or is not a terminal
for child in node.children:#dealing with the child these nodes
self.printNames(child)
else:
print node.name#this will print the names of terminals
# Write a recursive function to calculate the total tree length (the sum of
# all the branch lengths). Again, the root node of a tree should be the only
# argument the first time this function is called. Due next Tues (3/17).
def treeLength(self,node):
"""
A method to calculate and return total tree length.
"""
sumbrl=0
if node.children is not None:
for child in node.children:
sumbrl += self.treeLength(child)###this sums the value of branch length and assigns the new value to the sum
return node.brl + sumbrl
# Write a recursive function that takes the root node as one of its arguments
# and prints out a parenthetical (Newick) tree string. Due next Tues (3/17).
def newickLists(self,node):##this function gives a tree with relationships in a list of lists
"""
A method of a Tree object that will print out the Tree as a
parenthetical List.
"""
listoflists=[]
if len(node.children) > 0:
for child in node.children:
d=self.newickLists(child)
listoflists.append(d)#you just append each node as one list inside another
else:
listoflists.append(node.name)
return listoflists###instead of a string this returns a list of lists, I know this could be a problem in the future, but for now the relationships of the nodes are mantained
##This is the recursive function that will print the parenthetical tree in a string
##Based on Subir's code
def newickstring(self,node):
"""
A method of a Tree object that will print out the Tree as a
parenthetical string (Newick format).
"""
parenthetical = "("
if len(node.children) == 0:#dealing with the terminals
return node.name + ":" + str(node.brl)
else:
for child in node.children:
if node.children[-1] == child:
parenthetical += self.newickstring(child)
else:
parenthetical += self.newickstring(child) + ","
if node.brl is not 0:
parenthetical += "):" + str(node.brl)
else:
parenthetical += ")"
return parenthetical
# Now, let's write a recursive function to simulate sequence evolution along a
# tree. This amounts to simply simulating evolution along each branch
# from the root towards the tips. We'll need to use our ctmc class for setting the
# conditions of our simulation, which is why we imported it above our tree
# class definition. In this case, we've stored the definition of our ctmc
# class in a separate file (ctmc.py) to keep our tree code compact.
# Now, let's add a ctmc object to each internal node in our tree (except the
# root). Again, it would be best to add the ctmcs as part of the Node
# constructor, if we know that we'll be simulating data.
# Try to get this simulator and associated functions working by next Thurs. (3/19)
"""
#I did not use this function. My simulation in a tree is in the next two functions
This method of a Tree object defines a ctmc object associated with all
nodes that have a branch length (i.e., all but the root).
def setModels(self,node):
if node.children is not None:
for child in node.children:
if child.brl > 0:
ContMark=CTMC()
return ContMark
"""
def RootState(self):##This function draws a starting state from the stationary frequencies.
Q=[[-1.916,0.541,0.787,0.588],[0.148,-1.069,0.417,0.506],[0.286,0.170,-0.591,0.135],[0.525,0.236,0.594,-1.355]]
A = np.squeeze(np.asarray(Q))
StationaryProbs=linalg.expm(A*1000)##getting the matrix with stationary probability.
StartingPointvalue=discSamp(events=[0,1,2,3],probs=StationaryProbs[1])
return StartingPointvalue## 0 corresponds to A, 1 corresponds to C, 2 corresponds to G and 3 to T
#This is the simulator:
def simulate(self,node,StartingPoint):##This function simulates markov chains in each node of my tree
"""
This method simulates evolution along the branches of a tree, taking
the root node as its initial argument.
"""
if node.parent is None:##dealing with the root
for child in node.children:
chain=ContMarkSim(v=child.brl,NodeChain=StartingPoint)##the first starting point will come from the RootState function
chain.MarkSim()
node.chain = chain.list4[-1]##this gives me the last nucleotide of the chain
print chain.list4
self.simulate(node=child,StartingPoint=node.chain)##recursion starting from the last state of the previous node
else:
for child in node.children:##dealing with internal nodes
chain=ContMarkSim(v=child.brl,NodeChain=node.parent.chain)##the first state will be the last one of the parent chain
chain.MarkSim()
node.chain = chain.list4[-1]
print chain.list4
self.simulate(node=child,StartingPoint=node.chain)
#the chains are in numbers (0=A;1=C;2=G;3=T)
def buildtree (self,newick_tree):
import ete2 ##This module has a set of functions that do well with newick structures. As I did not have time to come up with a new fucntion to transform a newick object in a tree, I used the built-in function. However, as soon as I have more time I will work on an original funtion to solve this problem.
# Loads a tree structure from a newick string. The returned variable ’t’ is the root node for the tree.
string = str (newick_tree)
t = ete2.Tree(string)
print t##will print the tree structure, but the t object has all the relationships
##The following function will start from a newick string and will give all the relationships between nodes and branch lenghths
def ReadNewick (self,data,base,space = None, extVar = None):
if base == "root" :#the content of this if statement will deal with the sons of the root
root = Node("root")
root.brl = "0"
extVar = root#creating a value that will be returned
son1 = data.partition('(')[-1].rpartition(')')[0]#striping the first parenthesis
regEx1 = re.compile(r'(.*?)\(.*\)')#separating values inside from outside parenthesis
result1 = re.findall(regEx1, son1)#another few steps to separate values inside and outside the parenthesis
result1=result1.pop(0)
result2 = son1.replace(result1,"")
result1 = result1[:-1]
result1=result1.split(",")
result1.append(result2)#both values (root children) are now in a list
for obj in result1:#for each child in this list
branchname=obj.rpartition(":")[0]#let's grab the value before the ":", because this is the name of the node
child=Node(name=branchname,parent="root")#this establishes the relationships of these nodes with the root
print "node name:",child.name #printing the sons of the root
root.children.append(child)
brl=obj.rpartition(":")[-1]#getting the branch length for each child
child.brl=brl
print "branch length:",child.brl #and their branch length
child.parent = root
for item in root.children:
if item.name[0]=="(":#if the child starts with a parenthesis, this means the child is an internal node, so we do a recursion to work on this node:
self.ReadNewick(data=item.name,base="x",space=item)#making the recursion
return extVar#this will return the root
else:#now we are dealing with internal nodes which first character is an oppen parenthesis
son2 = data.partition('(')[-1].rpartition(')')[0]#striping the parenthesis
regEx1 = re.compile(r'(.*?)\(.*\)')#separating values inside and outside the parenthesis
result3 = re.findall(regEx1, son2)
if result3 != [] and result3 != ['']:#if there are still values outside the parenthesis for example in tree (A,(B,(C,D))), when you strip the first parenthesis you will have A and (B,(C,D))
result3=result3.pop(0)#just some steps to separate values inside and outside the parenthesis
result4 = son2.replace(result3,"")
result3=result3.split(",")
result3 = result3[:-1]
result3.append(result4)#putting the values in a list
for obj in result3:#for each object in this list (the children of the node)
branchname=obj.rpartition(":")[0]#grab the node name
child=Node(name=branchname,parent=data)#this establishes the relationships between the values in the list and its parent
print "node name:", child.name #this will give the child of each internal node
space.children.append(child)#dealing with space (space in the memory instead of node.name)
brl=obj.rpartition(":")[-1]#grabing the branch length
child.brl=brl
print "branchlength:", child.brl #this will give the branch length for each internal node
child.parent=space
if child.name[0]=="(":#if some of the child still is inside a parenthesis, make a recursion for the child
self.ReadNewick(data=child.name,base="x",space=child)
else:# this will deal with nodes with no values outside parenthesis... for example, in a tree (A,(B,(C,D))), this will deal with the node (C,D)
if son2.count("(") > 0:#this will deal with polytomies
son3=son2.split(",(")
for value in son3:
if value[0]!="(":
value1 = son3.pop(son3.index(value))
value2 = "(" + value1
son3.append(value2)
for obj in son3:
branchname=obj.rpartition(":")[0]
child=Node(name=branchname,parent=data)
space.children.append(child)
brl=obj.rpartition(":")[-1]
child.brl=brl
print "node name:", child.name
print "branch length:", child.brl
child.parent=space
self.ReadNewick(data=child.name,base="x",space=child)
else:#this will deal with (C,D) for example and the next steps will deal with the most internal nodes.
son3 = son2.split(",")
for obj in son3:
branchname=obj.rpartition(":")[0]
child=Node(name=branchname,parent=data)
space.children.append(child)
brl=obj.rpartition(":")[-1]
child.brl=brl
print "node name:", child.name
print "branch length:", child.brl
child.parent=space
d=MyTree()
d.printNames(node=d.root)
d.treeLength(node=d.root)
d.newickLists(node=d.root)##This gives a list of lists and not a string...=P
stringTree=d.newickstring(node=d.root)
print stringTree
d.RootState()
d.simulate(node=d.root,StartingPoint=d.RootState())##printing the sequences for each node.
d.buildtree(newick_tree="(A:1,(B:1,(E:1,D:1):0.5):0.5);")##printing the tree structure from any newick objetc.
banana="(A:1,B:0.5,(C:0.1,((D:0.2,E:1):0.5):1):0.6)"
x=d.ReadNewick(data=banana,base="root",space=None)
d.newickLists(node=x)
d.newickstring(node=x)