import numpy as np
import matplotlib.pyplot as plt
from public_tests import *
from utils import *
# %matplotlib inline  # Jupyter line magic; only valid when run inside a notebook
X_train = np.array([[1,1,1],[1,0,1],[1,0,0],[1,0,0],[1,1,1],[0,1,1],[0,0,0],[1,0,1],[0,1,0],[1,0,0]])
y_train = np.array([1,1,0,0,1,0,0,1,1,0])
print("First few elements of X_train:\n", X_train[:5])
print("Type of X_train:",type(X_train))
print("First few elements of y_train:", y_train[:5])
print("Type of y_train:",type(y_train))
print ('The shape of X_train is:', X_train.shape)
print ('The shape of y_train is: ', y_train.shape)
print ('Number of training examples (m):', len(X_train))
# UNQ_C1
# GRADED FUNCTION: compute_entropy
def compute_entropy(y):
"""
    Computes the entropy at a node
Args:
y (ndarray): Numpy array indicating whether each example at a node is
edible (`1`) or poisonous (`0`)
Returns:
entropy (float): Entropy at that node
"""
# You need to return the following variables correctly
entropy = 0.
### START CODE HERE ###
p1 = np.mean(y)
if p1 == 0 or p1 == 1:
return 0
else:
# Compute the entropy
entropy = -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)
### END CODE HERE ###
return entropy
# Compute entropy at the root node (i.e. with all examples)
# Since we have 5 edible and 5 non-edible mushrooms, the entropy should be 1
print("Entropy at root node: ", compute_entropy(y_train))
# UNIT TESTS
compute_entropy_test(compute_entropy)
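
# Not part of the graded assignment: a quick hand-checkable example of the entropy formula.
# With 3 edible and 1 poisonous example, p1 = 0.75, so
# H = -0.75*log2(0.75) - 0.25*log2(0.25) which is approximately 0.811.
print("Entropy of [1, 1, 1, 0]: ", compute_entropy(np.array([1, 1, 1, 0])))  # expected ~0.811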
# UNQ_C2
# GRADED FUNCTION: split_dataset
def split_dataset(X, node_indices, feature):
"""
Splits the data at the given node into
left and right branches
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
node_indices (list): List containing the active indices. I.e, the samples being considered at this step.
feature (int): Index of feature to split on
Returns:
left_indices (list): Indices with feature value == 1
right_indices (list): Indices with feature value == 0
"""
# You need to return the following variables correctly
left_indices = []
right_indices = []
### START CODE HERE ###
for index in node_indices:
if X[index, feature] == 1:
left_indices.append(index)
else:
right_indices.append(index)
### END CODE HERE ###
return left_indices, right_indices
# Case 1
root_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Feel free to play around with these variables
# The dataset only has three features, so this value can be 0 (Brown Cap), 1 (Tapering Stalk Shape) or 2 (Solitary)
feature = 0
left_indices, right_indices = split_dataset(X_train, root_indices, feature)
print("CASE 1:")
print("Left indices: ", left_indices)
print("Right indices: ", right_indices)
# Visualize the split
generate_split_viz(root_indices, left_indices, right_indices, feature)
print()
# Case 2
root_indices_subset = [0, 2, 4, 6, 8]
left_indices, right_indices = split_dataset(X_train, root_indices_subset, feature)
print("CASE 2:")
print("Left indices: ", left_indices)
print("Right indices: ", right_indices)
# Visualize the split
generate_split_viz(root_indices_subset, left_indices, right_indices, feature)
# UNIT TESTS
split_dataset_test(split_dataset)
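
# Not graded - an equivalent, vectorized way to split using a NumPy boolean mask.
# This is only a sketch illustrating the same logic as split_dataset above.
def split_dataset_vectorized(X, node_indices, feature):
    node_indices = np.array(node_indices)
    mask = X[node_indices, feature] == 1
    left_indices = node_indices[mask].tolist()
    right_indices = node_indices[~mask].tolist()
    return left_indices, right_indices

# Sanity check: should match split_dataset on the root indices
print(split_dataset_vectorized(X_train, root_indices, feature=0))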
# UNQ_C3
# GRADED FUNCTION: compute_information_gain
def compute_information_gain(X, y, node_indices, feature):
"""
    Compute the information gain of splitting the node on a given feature
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
y (array like): list or ndarray with n_samples containing the target variable
node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
feature (int): Index of feature to split on
Returns:
        information_gain (float): Information gain from splitting on the given feature
"""
# Split dataset
left_indices, right_indices = split_dataset(X, node_indices, feature)
# Some useful variables
X_node, y_node = X[node_indices], y[node_indices]
X_left, y_left = X[left_indices], y[left_indices]
X_right, y_right = X[right_indices], y[right_indices]
# You need to return the following variables correctly
information_gain = 0
    ### START CODE HERE ###
    # Entropy at the node before the split
    entropy_node = compute_entropy(y_node)

    # Entropy of each branch after the split (an empty branch contributes zero)
    entropy_left = compute_entropy(y_left) if len(left_indices) > 0 else 0
    entropy_right = compute_entropy(y_right) if len(right_indices) > 0 else 0

    # Weighted average entropy after the split
    w_left = len(left_indices) / len(node_indices)
    w_right = len(right_indices) / len(node_indices)
    weighted_entropy = w_left * entropy_left + w_right * entropy_right

    # Information gain = entropy before the split minus weighted entropy after it
    information_gain = entropy_node - weighted_entropy
    ### END CODE HERE ###
return information_gain
info_gain0 = compute_information_gain(X_train, y_train, root_indices, feature=0)
print("Information Gain from splitting the root on brown cap: ", info_gain0)
info_gain1 = compute_information_gain(X_train, y_train, root_indices, feature=1)
print("Information Gain from splitting the root on tapering stalk shape: ", info_gain1)
info_gain2 = compute_information_gain(X_train, y_train, root_indices, feature=2)
print("Information Gain from splitting the root on solitary: ", info_gain2)
# UNIT TESTS
compute_information_gain_test(compute_information_gain)
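
# Not graded - working through the "solitary" split (feature 2) by hand:
# the left branch has y = [1, 1, 1, 0, 1] (p1 = 0.8) and the right branch y = [0, 0, 0, 1, 0] (p1 = 0.2),
# so both branch entropies are ~0.722, the weighted entropy is ~0.722,
# and the information gain is ~1 - 0.722 = 0.278, matching info_gain2 above.
left2, right2 = split_dataset(X_train, root_indices, feature=2)
weighted2 = (len(left2) * compute_entropy(y_train[left2]) +
             len(right2) * compute_entropy(y_train[right2])) / len(root_indices)
print("Hand-computed gain for solitary: ", compute_entropy(y_train[root_indices]) - weighted2)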
# UNQ_C4
# GRADED FUNCTION: get_best_split
def get_best_split(X, y, node_indices):
"""
    Returns the best feature to split the node data on
    (the features here are binary, so no threshold is needed)
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
y (array like): list or ndarray with n_samples containing the target variable
node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
Returns:
best_feature (int): The index of the best feature to split
"""
# Some useful variables
num_features = X.shape[1]
# You need to return the following variables correctly
best_feature = -1
### START CODE HERE ###
# Check if the target variable is pure
if len(np.unique(y[node_indices])) == 1:
return -1
best_information_gain = -1
for feature in range(num_features):
information_gain = compute_information_gain(X, y, node_indices, feature)
if information_gain > best_information_gain:
best_information_gain = information_gain
best_feature = feature
### END CODE HERE ###
return best_feature
best_feature = get_best_split(X_train, y_train, root_indices)
print("Best feature to split on: %d" % best_feature)
# UNIT TESTS
get_best_split_test(get_best_split)
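
# Not graded - an optional cross-check against scikit-learn (assumes scikit-learn is installed).
# A depth-1 tree grown with the entropy criterion should also pick the "solitary" feature (index 2)
# as its first split, matching get_best_split above.
from sklearn.tree import DecisionTreeClassifier

sk_tree = DecisionTreeClassifier(criterion="entropy", max_depth=1, random_state=0)
sk_tree.fit(X_train, y_train)
print("scikit-learn root split feature: ", sk_tree.tree_.feature[0])  # expected: 2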
# Not graded
tree = []
def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
"""
    Build a tree using the recursive algorithm that splits the dataset into 2 subgroups at each node.
This function just prints the tree.
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
y (array like): list or ndarray with n_samples containing the target variable
node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
branch_name (string): Name of the branch. ['Root', 'Left', 'Right']
max_depth (int): Max depth of the resulting tree.
current_depth (int): Current depth. Parameter used during recursive call.
"""
# Maximum depth reached - stop splitting
if current_depth == max_depth:
formatting = " " * current_depth + "-" * current_depth
print(formatting, "%s leaf node with indices" % branch_name, node_indices)
return
# Otherwise, get best split and split the data
    # Get the best feature to split on at this node
    best_feature = get_best_split(X, y, node_indices)

    # If the node is already pure, get_best_split returns -1, so stop and treat it as a leaf
    if best_feature == -1:
        formatting = " " * current_depth + "-" * current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return
formatting = "-" * current_depth
print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))
# Split the dataset at the best feature
left_indices, right_indices = split_dataset(X, node_indices, best_feature)
tree.append((left_indices, right_indices, best_feature))
# continue splitting the left and the right child. Increment current depth
build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth + 1)
build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth + 1)
build_tree_recursive(X_train, y_train, root_indices, "Root", max_depth=2, current_depth=0)
generate_tree_viz(root_indices, y_train, tree)
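
# Not graded - a minimal prediction sketch. The `tree` list above only stores index splits for
# visualization, so this sketch re-runs get_best_split/split_dataset on the training data while
# walking a new example down the tree, then predicts the majority class at the leaf it reaches.
def predict_one(x, X, y, node_indices, max_depth, current_depth=0):
    best_feature = get_best_split(X, y, node_indices)
    if current_depth == max_depth or best_feature == -1:
        # Leaf: predict the majority class among the training examples at this node
        return int(np.mean(y[node_indices]) >= 0.5)
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)
    next_indices = left_indices if x[best_feature] == 1 else right_indices
    if len(next_indices) == 0:
        return int(np.mean(y[node_indices]) >= 0.5)
    return predict_one(x, X, y, next_indices, max_depth, current_depth + 1)

predictions = [predict_one(x, X_train, y_train, root_indices, max_depth=2) for x in X_train]
print("Training accuracy of the depth-2 tree: ", np.mean(np.array(predictions) == y_train))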