-
Notifications
You must be signed in to change notification settings - Fork 0
/
Attrition_Prediction_Algorithm.py
143 lines (97 loc) · 4.6 KB
/
Attrition_Prediction_Algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
"""import warnings filter"""
from warnings import simplefilter
"""ignore all future warnings"""
simplefilter(action='ignore', category=FutureWarning)
"""Read csv file"""
Leave_Dataset = pd.read_csv('Combined-Employees-Dataset.csv', nrows=1).columns
Leave_Dataset = pd.read_csv('Combined-Employees-Dataset.csv', usecols=Leave_Dataset[1:])
print(Leave_Dataset.tail(5))
"""Target data Isolated"""
YtargetData = Leave_Dataset['left']
"""these columns not needed now"""
to_drop = ['Emp ID', 'salary', 'left']
leave_feat_space = Leave_Dataset.drop(to_drop, axis=1)
"""Pull out features for future use"""
features = leave_feat_space.columns
"""Convert label features to integers"""
le_sales = preprocessing.LabelEncoder()
le_sales.fit(leave_feat_space["dept"])
leave_feat_space["dept"] = le_sales.transform(leave_feat_space.loc[:,('dept')])
"""Transform the whole feature space into a matrix"""
X = leave_feat_space.values
"""Standardize all features"""
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels:", np.unique(YtargetData))
"""Prediction Function"""
def prediction_func(X, y, clf_class, method, **kwargs):
from sklearn.model_selection import cross_val_predict
"""Initialize a classifier with key word arguments"""
clf = clf_class(**kwargs)
predicted = cross_val_predict(clf, X, y, cv=3, method=method)
return predicted
def accuracy_measure(y, predicted):
"""NumPy interprets True and False as 1. and 0."""
return metrics.accuracy_score(y, predicted)
print("Support Vector Machines:")
print("%.3f" % accuracy_measure(YtargetData, prediction_func(X, YtargetData, SVC, method='predict')))
print("Random Forest:")
print("%.3f" % accuracy_measure(YtargetData, prediction_func(X, YtargetData, RF, method='predict')))
print("K-Nearest-Neighbors:")
print("%.3f" % accuracy_measure(YtargetData, prediction_func(X, YtargetData, KNN, method='predict')))
"""Target classes"""
y = np.array(YtargetData)
class_names = np.unique(y)
"""Confusion matrices calculation"""
confusion_matrices = [
("Support Vector Machines", confusion_matrix(y, prediction_func(X, y, SVC, method='predict'))),
("Random Forest", confusion_matrix(y, prediction_func(X, y, RF, method='predict'))),
("K-Nearest-Neighbors", confusion_matrix(y, prediction_func(X, y, KNN, method='predict'))),
]
#
"""Print confusion matrix values"""
print confusion_matrices
"""Draw confusion matrices for prediction algorithms evaluation"""
for cfm in confusion_matrices:
ax = plt.axes()
ax.set_title(cfm[0])
df_cm = pd.DataFrame(cfm[1], index=["Real 0", "Real 1"], columns=["Pred 0", "Pred 1"])
plt.figure(figsize=(6, 5))
sb.heatmap(df_cm, annot=True, ax=ax, square=True, fmt="d", linewidths=.5)
plt.show()
"""Use 10 estimators so predictions are all multiples of 0.1"""
pred_prob = prediction_func(X, y, RF, n_estimators=10, method='predict_proba',)
pred_leave = pred_prob[:, 1]
is_leave = y == 1
"""Number of times a predicted probability is assigned to an observation"""
counts = pd.value_counts(pred_leave)
"""probabilities of truth calculation"""
true_prob = {}
for prob in counts.index:
true_prob[prob] = np.mean(is_leave[pred_leave == prob])
true_prob = pd.Series(true_prob)
"""Pandas concatenate function"""
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)
"""create a dataframe containing prob values"""
pred_prob_df = pd.DataFrame(pred_prob)
pred_prob_df.columns = ['prob_not_leaving', 'prob_leaving']
"""merge dataframes to get the name of employees"""
all_employees_pred_prob_df = pd.concat([Leave_Dataset, pred_prob_df], axis=1)
"""filter out employees still in the company and having a good evaluation"""
good_employees_still_working_df = all_employees_pred_prob_df[(all_employees_pred_prob_df["left"] == 0) & (all_employees_pred_prob_df["last_evaluation"] >= 0.7)]
good_employees_still_working_df.sort_values(by='prob_leaving', ascending=False, inplace=True)
"""Save into csv file"""
good_employees_still_working_df.to_csv("employees_leaving_prob.csv")