decision.py

from pprint import pprint
from scipy.stats import chi2, chi2_contingency
from sys import argv

import numpy as np
import pandas as pd

file_name = argv[1]

# alpha value for chi-square pruning (defaults to 5%)
if len(argv) > 2:
    alpha = float(argv[2])
else:
    alpha = 0.05  # 5%
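
# Illustrative usage (not part of the original script). The CSV is expected to
# have a header row of attribute names and the class label in the last column;
# the alpha level is an optional second argument. "weather.csv" below is a
# hypothetical file name used only as an example:
#
#   python decision.py weather.csv 0.05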

def partition(a):
    """takes the values of an attribute and splits them by value
    params:
        a (list):
            list of values of the attribute
    return:
        (dict):
            dictionary mapping each distinct attribute value to the array of
            positions where that value occurs
    """
    return {c: (a == c).nonzero()[0] for c in np.unique(a)}
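
# Illustrative example (not part of the original script): the function groups
# the positions of each distinct value, e.g.
#   partition(np.array(['sunny', 'rain', 'sunny']))
# groups the indices as {'rain': [1], 'sunny': [0, 2]} (values are numpy index arrays).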

def entropy(s):
    """takes values of an attribute as parameter and returns entropy
    params:
        s (list):
            list of values of the attribute
    return:
        res (float):
            Entropy value of the attribute
    """
    res = 0
    val, counts = np.unique(s, return_counts=True)
    freqs = counts.astype('float') / len(s)
    for p in freqs:
        if p != 0.0:
            res -= p * np.log2(p)
    return res
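
# Illustrative check (not part of the original script): a perfectly balanced
# binary attribute has an entropy of one bit, e.g.
#   entropy(np.array(['yes', 'no', 'yes', 'no']))
# evaluates to 1.0.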

def information_gain(y, x):
    """takes values of the class and of an attribute and returns the information gain
    params:
        y (list):
            list of values of the class
        x (list):
            list of values of the attribute
    return:
        res (float):
            Information gain of the attribute
    """
    res = entropy(y)
    # We partition x according to its attribute values x_i
    val, counts = np.unique(x, return_counts=True)
    freqs = counts.astype('float') / len(x)
    # and subtract the weighted average entropy of the partitions
    for p, v in zip(freqs, val):
        res -= p * entropy(y[x == v])
    return res
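
# Illustrative check (not part of the original script): an attribute that
# separates the classes perfectly recovers the full class entropy, e.g.
#   information_gain(np.array(['yes', 'yes', 'no', 'no']), np.array(['a', 'a', 'b', 'b']))
# evaluates to 1.0, while an attribute independent of the class gives a gain of 0.0.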

def is_pure(s):
    """takes values of the class and returns whether they all have the same value
    params:
        s (list):
            list of values of the class
    return:
        (boolean):
            True if pure, else False
    """
    return len(set(s)) == 1

def recursive_split(x, y, attr_name):
    """takes the examples and class values and returns the best split of the data
    params:
        x (list of list):
            list of examples, each with the values of its attributes
        y (list):
            list of class values of the examples
        attr_name (list of string):
            list of names of the attributes
    return:
        (dictionary):
            keys describe the best split (the attribute with maximum information
            gain) and each value is either a class label or a nested dictionary
            (the decision tree)
    """
    # If there is nothing left to split, just return the original (empty) set
    if len(y) == 0:
        return y
    # If all examples share the same class, return that class as a leaf
    if is_pure(y):
        return str(y[0])
    # We get the attribute that gives the highest information gain
    gain = np.array([information_gain(y, x_attr) for x_attr in x.T])
    selected_attr = np.argmax(gain)
    # If there's no gain at all, nothing has to be done, just return the original set
    if np.all(gain < 1e-6):
        return y
    # We split using the selected attribute
    sets = partition(x[:, selected_attr])
    res = {}
    for k, v in sets.items():
        # create subsets of the data based on the split
        y_subset = y.take(v, axis=0)
        x_subset = x.take(v, axis=0)
        x_subset = np.delete(x_subset, selected_attr, 1)
        attr_name_subset = attr_name[:selected_attr] + attr_name[selected_attr + 1:]
        # recurse on the remaining subset of the data
        res[attr_name[selected_attr] + " = " + k] = recursive_split(x_subset, y_subset, attr_name_subset)
    return res
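
# Illustrative shape of the returned tree (the attribute and value names here
# are hypothetical, not taken from any particular data set):
#
#   {'outlook = sunny': {'humidity = high': 'no', 'humidity = normal': 'yes'},
#    'outlook = rain': 'yes'}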

def pruneLeaves(obj):
    """takes a decision tree and returns the tree with its leaves pruned based on the chi-square test
    params:
        obj (dict):
            decision tree encoded as a nested dictionary
    return:
        obj (dict):
            decision tree with pruned leaves
    """
    isLeaf = True
    parent = None
    # find the first child that is itself a subtree; if none exists, this node has only leaves
    for key in obj:
        if isinstance(obj[key], dict):
            isLeaf = False
            parent = key
            break
    # prune this node if its attribute did not pass the chi-square test
    if isLeaf and next(iter(obj)).split(' ')[0] not in satisfied_attributes:
        global pruned
        pruned = True
        return 'pruned'
    if not isLeaf:
        # recurse into the subtree and drop it if it was pruned away entirely
        if pruneLeaves(obj[parent]) == 'pruned':
            obj[parent] = None
    return obj
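
# Illustrative behaviour (hypothetical attribute names): with
# satisfied_attributes = ['outlook'], a node such as
#   {'windy = true': 'no', 'windy = false': 'yes'}
# contains only leaves and its attribute 'windy' failed the chi-square test,
# so the call returns 'pruned' and the parent replaces that subtree with None.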

# read the examples from the csv file
data = np.loadtxt(open(file_name, "rb"), delimiter=",", dtype=str, converters={3: lambda s: s.strip()})
# the first row holds the attribute names (the last entry is the class name, so drop it)
attr_name = data.take(0, 0)[:-1].tolist()
# the last column (minus the header row) holds the class values
y = data[..., -1][1:]
# the remaining columns (minus the header row) hold the examples
X = data[..., :-1]
X = np.delete(X, 0, 0)

# call recursive_split to train the decision tree
tree = recursive_split(X, y, attr_name)

# determine which attributes pass the chi-square test at the given alpha level
satisfied_attributes = []
for i in range(len(attr_name)):
    contingency = pd.crosstab(X.T[i], y)
    c, p, dof, expected = chi2_contingency(contingency)
    if c > chi2.isf(q=alpha, df=dof):
        satisfied_attributes.append(attr_name[i])
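
# Note (equivalent formulation, not in the original script): comparing the
# statistic c against the critical value chi2.isf(q=alpha, df=dof) is the same
# as checking that the p-value returned by chi2_contingency is below alpha (p < alpha).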

print('\nDecision tree before pruning-\n')
pprint(tree)

print('\nDecision tree after pruning-\n')
pruned = True
while pruned:
    # keep pruning while leaves can still be removed, or until the whole tree has been pruned away
    pruned = False
    tree = pruneLeaves(tree)
    if tree == 'pruned':
        break
pprint(tree)