# week3.py — decision-tree classification tutorial (train/test split, GridSearchCV tuning,
# feature-importance analysis, and tree visualization).
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from dm_tools import data_prep
import pydot
from io import StringIO
from sklearn.tree import export_graphviz
# helper functions from the end. Put these two in dm_tools at the end of tutorial. Import it with this method instead
# from dm_tools import analyse_feature_importance, visualize_decision_tree
def analyse_feature_importance(dm_model, feature_names, n_to_display=20):
    """Print the top n_to_display features of a fitted model, ranked by importance.

    dm_model must expose a feature_importances_ array (e.g. a fitted tree model);
    feature_names is indexable in the same order as the model's input columns.
    """
    scores = dm_model.feature_importances_
    # rank feature indices from most to least important, then keep the head
    ranked = np.argsort(scores)[::-1][:n_to_display]
    for idx in ranked:
        print(feature_names[idx], ':', scores[idx])
def visualize_decision_tree(dm_model, feature_names, save_name):
    """Render a fitted decision tree to a PNG image at save_name."""
    dot_buffer = StringIO()
    # export the tree structure as graphviz dot text into the in-memory buffer
    export_graphviz(dm_model, out_file=dot_buffer, feature_names=feature_names)
    # graph_from_dot_data returns a list of graphs; the first one holds the tree
    tree_graph = pydot.graph_from_dot_data(dot_buffer.getvalue())[0]
    tree_graph.write_png(save_name)
# preprocessing step: load and clean the dataset via the shared helper
df = data_prep()
# train/test split: TargetB is the binary label, everything else is a feature
y = df['TargetB']
X = df.drop(['TargetB'], axis=1)
# .as_matrix() was removed in pandas 1.0 — .to_numpy() is the supported replacement
X_mat = X.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.5, random_state=42)
# baseline: an untuned decision tree, evaluated on the held-out half
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
##########################################
# grid search CV #1: coarse sweep over split criterion, tree depth, and leaf size
coarse_grid = {'criterion': ['gini', 'entropy'],
               'max_depth': range(3, 10),
               'min_samples_leaf': range(20, 60, 10)}
cv = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=coarse_grid, cv=10)
cv.fit(X_train, y_train)
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))
# evaluate the refit best estimator on the held-out split
y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))
# report the winning hyper-parameter combination
print(cv.best_params_)
##########################################
# grid search CV #2: finer sweep around the region the first search pointed to
fine_grid = {'criterion': ['gini'],
             'max_depth': range(2, 5),
             'min_samples_leaf': range(40, 61, 5)}
cv = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=fine_grid, cv=10)
cv.fit(X_train, y_train)
print("Train accuracy:", cv.score(X_train, y_train))
print("Test accuracy:", cv.score(X_test, y_test))
# evaluate the refit best estimator on the held-out split
y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred))
# report the winning hyper-parameter combination
print(cv.best_params_)
##########################################
# inspect the tuned model: top-20 features and a rendered tree diagram
analyse_feature_importance(cv.best_estimator_, X.columns, 20)
visualize_decision_tree(cv.best_estimator_, X.columns, "dm_best_cv.png")