-
Notifications
You must be signed in to change notification settings - Fork 1
/
Function Likelihood in a Tree.py
189 lines (175 loc) · 10.4 KB
/
Function Likelihood in a Tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 19:03:59 2015
@author: Glaucia
"""
from scipy import linalg
import numpy as np
import re
class Node:
def __init__(self,name="",parent=None,children=None):
self.name = name
self.parent = None
if children is None:
self.children = []
else:
self.children = children
extVar = None
###This function reads a newick and determines the relationship among nodes. If you want more details or comments, go to the Node Class - Exercise6b
def ReadNewick (data,base,space = None, extVar = None):
if base == "root" :#the content of this if statement will deal with the sons of the root
root = Node("root")
root.brl = "0"
extVar = root
son1 = data.partition('(')[-1].rpartition(')')[0]
regEx1 = re.compile(r'(.*?)\(.*\)')
result1 = re.findall(regEx1, son1)
result1=result1.pop(0)
result2 = son1.replace(result1,"")
result1 = result1[:-1]
result1=result1.split(",")
result1.append(result2)
for obj in result1:
branchname=obj.rpartition(":")[0]
child=Node(name=branchname,parent="root")
print child.name #printing the sons of the root
root.children.append(child)
brl=obj.rpartition(":")[-1]
child.brl=brl
print child.brl #and their brancj length
child.parent = root
print child.parent #parent
for item in root.children:
if item.name[0]=="(":
ReadNewick(data=item.name,base="x",space=item)
return extVar
else:
son2 = data.partition('(')[-1].rpartition(')')[0]
regEx1 = re.compile(r'(.*?)\(.*\)')
result3 = re.findall(regEx1, son2)
if result3 != [] and result3 != ['']:
result3=result3.pop(0)
result4 = son2.replace(result3,"")
result3=result3.split(",")
result3 = result3[:-1]
result3.append(result4)
for obj in result3:
branchname=obj.rpartition(":")[0]
child=Node(name=branchname,parent=data)
print child.name #this will give the child of each internal node
space.children.append(child)
brl=obj.rpartition(":")[-1]
child.brl=brl
print child.brl #this will give the branch length for each internal node
child.parent=space
print child.parent #parent
if child.name[0]=="(":
ReadNewick(data=child.name,base="x",space=child)
else:
if son2.count("(") > 0:
son3=son2.split(",(")
print son3
for value in son3:
if value[0]!="(":
value1 = son3.pop(son3.index(value))
value2 = "(" + value1
son3.append(value2)
print son3
for obj in son3:
branchname=obj.rpartition(":")[0]
child=Node(name=branchname,parent=data)
space.children.append(child)
brl=obj.rpartition(":")[-1]
child.brl=brl
print child.name
print child.brl
child.parent=space
print child.parent #parent
ReadNewick(data=child.name,base="x",space=child)
else:
son3 = son2.split(",")
print son3,"teste"
for obj in son3:
branchname=obj.rpartition(":")[0]
child=Node(name=branchname,parent=data)
space.children.append(child)
brl=obj.rpartition(":")[-1]
child.brl=brl
print child.name
print child.brl
child.parent=space
print child.parent #parent
SomeTree="(A:1,(B:0.1,(C:0.2,D:1):1):0.6)"
x=ReadNewick(data=SomeTree,base="root",space=None)
##just a function to associate some nucleotides with the tips of a tree, this function is not flexible. I coul make it more flexible and able to deal with other kinds of trees and sequences, but I will leave it like this for now just to help to calculate the likelihood
def printNucl(node):
"""
A method of a Tree object that will print out the names of its
terminal nodes.
"""
if len(node.children) > 0:
node.nucl = []
for child in node.children:
printNucl(child)
else:
if node.name == "A":
node.nucl=[1,0,0,0]#terminal A has nucleotide A
elif node.name == "B":
node.nucl=[1,0,0,0]#terminal B has nucleotide A
elif node.name == "C":
node.nucl=[1,0,0,0]#terminal C has nucleotide A
elif node.name == "D":
node.nucl=[1,0,0,0]#terminal D has nucleotide A
print node.nucl
return node
x = printNucl(x)
######################################
#####This function calculates de likelihood of a Tree for one nucleotide, starting with just the root and a Q matrix
#####If you want to deal with the likelihood for several nucleotides, you can multipliy the results of likelihood obtained in each tree
######################################
def TreeLikelihood (node,Q):
array = np.squeeze(np.asarray(Q))##this transforms my Q matrix in an array A
if len(node.children) > 0 and node.nucl == []:##I made this if statement to get from the root to the tips of the tree. node.nucl represents a list of 4 values, where each value is the probabilitie of having a nucleotide at that site.
for child in node.children:
TreeLikelihood(child,Q)#listA,listC,listG,listT)
else:
listA=[]#this list will keep the information on the probabilities of having an A at each internal node
listC=[]#this list will keep the information on the probabilities of having an C at each internal node
listG=[]#this list will keep the information on the probabilities of having an G at each internal node
listT=[]#this list will keep the information on the probabilities of having an T at each internal node
if node.name != "root":#to deal with all the other nodes except for the root
for x in node.parent.children:#dealing with the children of the node's parent, in other words, the sisters of the node
print "node name:", x.name#just checking the node
print "node branch length:",x.brl,#checking its branch length
print "probabilities of each nucleotide for this node:",x.nucl#checking the presence of that list with the probabilities for each nucleotide
if x.nucl != []:#if the node has some information on the probabilities of each nucleotide do the folowing calculations:
MP=linalg.expm(array*float(x.brl))#getting the marginal probabilities in accordance with the node branch length
x.resA=MP[0][0]*x.nucl[0]+MP[0][1]*x.nucl[1]+MP[0][2]*x.nucl[2]+MP[0][3]*x.nucl[3]#Paa*Pa+Pac*Pc+Pag*Pg+Pat*Pt
x.resC=MP[1][0]*x.nucl[0]+MP[1][1]*x.nucl[1]+MP[1][2]*x.nucl[2]+MP[1][3]*x.nucl[3]#Pca*Pa+Pcc*Pc+Pcg*Pg+Pct*Pt
x.resG=MP[2][0]*x.nucl[0]+MP[2][1]*x.nucl[1]+MP[2][2]*x.nucl[2]+MP[2][3]*x.nucl[3]#Pga*Pa+Pgc*Pc+Pgg*Pg+Pgt*Pt
x.resT=MP[3][0]*x.nucl[0]+MP[3][1]*x.nucl[1]+MP[3][2]*x.nucl[2]+MP[3][3]*x.nucl[3]#Pta*Pa+Ptc*Pc+Ptg*Pg+Ptt*Pt
listA.append(x.resA)#putting the results in a list for A
listC.append(x.resC)#putting the results in a list for C
listG.append(x.resG)#puttinh the results in a list for G
listT.append(x.resT)#putting the results in a list for T
if len(listA)>1:##After doing for both sisters, the list will have more than one result.
squareA=reduce(lambda z, y: z*y,listA)#so we just need to multipliy the values inside the list to have the probabilities for the internal node.
if len(listC)>1:##doing this for all the nucleotides
squareC=reduce(lambda z, y: z*y,listC)
if len(listG)>1:
squareG=reduce(lambda z, y: z*y,listG)
if len(listT)>1:
squareT=reduce(lambda z, y: z*y,listT)
lista = [squareA,squareC,squareG, squareT]
x.parent.nucl=lista##than we can associate the probabilities for each nucleotide to the parent of the node (an internal node)
print "probability for each nucleotide associated with the node's parent:",x.parent.nucl
if x.parent.nucl != []:#now that the parent has some probabilities associated with it we just need to do the recursion now using the parent node
TreeLikelihood (node.parent,Q)#recursion with the parent node
else:
pass
else:#to deal with the root
StatProb=linalg.expm(array*100)#we just need to calculate the stationary probabilities using a high value as branch length
likelihood=node.nucl[0]*StatProb[0][0]+node.nucl[1]*StatProb[0][1]+node.nucl[2]*StatProb[0][2]+node.nucl[3]*StatProb[0][3]#and multipliy the probabilities associated with the root by the respective stationary probability and sum all the values obtained
print "LIKELIHOOD:",likelihood #this will give the likelihood
##As all the terminals have A as a nucleotide the last list (before multiplying the values by the stationary probs), you can see that the value corresponding to the probability of having an A in the root is much bigger than the other values: [0.0031342687620121074, 0.00031325924070299733, 0.00047299874512964405, 0.00093586514844217763]
TreeLikelihood(node=x,Q=[[-1.916,0.541,0.787,0.588],[0.148,-1.069,0.417,0.506],[0.286,0.170,-0.591,0.135],[0.525,0.236,0.594,-1.355]])