# genres_sl_functions.py
import os
import time
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# project imports
import constants as const
import plot_function

def makedir(dir_path):
    # create the directory if it does not already exist
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def load_data(data_path):
    # read the file and drop the unnecessary column
    raw_dataset = pd.read_csv(data_path)
    print("\nRaw Dataset Keys:\n\033[92m{}\033[0m".format(raw_dataset.keys()))
    df = raw_dataset.drop(["filename"], axis=1)
    print("\nData Shape: \033[92m{}\033[0m".format(df.shape))

    # encode the genre label as integer values
    # i.e.: blues = 0, ..., rock = 9
    encoder = preprocessing.OrdinalEncoder()
    df["genre"] = encoder.fit_transform(df[["genre"]])

    # split df into X (features) and y (labels)
    label_column = "genre"
    X = df.loc[:, df.columns != label_column]
    y = df.loc[:, label_column]

    # min-max scale the features to [0, 1]
    X_columns = X.columns
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled, columns=X_columns)

    return X, y, df
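
# Usage sketch (the CSV path below is illustrative, not an actual project file):
#   X, y, df = load_data("data/features.csv")
# The input CSV is expected to provide a "filename" column (dropped above) and a
# "genre" label column; every remaining column is treated as a numeric feature
# and min-max scaled to [0, 1].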

def prepare_datasets(X, y, test_size):
    # create the train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    return X_train, X_test, y_train, y_test

def get_classification_model():
    # dictionary of the classifiers to evaluate
    models = {}

    # Neural Network
    nn_model = MLPClassifier(solver="adam", alpha=1e-5, hidden_layer_sizes=(256, 128, 128, 64, 64, 32),
                             random_state=10, activation="relu", learning_rate="adaptive",
                             early_stopping=False, verbose=False, learning_rate_init=0.001, max_iter=500)
    models["NN"] = nn_model

    # Random Forest
    rf_model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=15)
    models["RF"] = rf_model

    # k-Nearest Neighbors
    knn_model = KNeighborsClassifier(n_neighbors=5, weights="distance", metric="euclidean")
    models["KNN"] = knn_model

    # Support Vector Machine
    svc_model = SVC(C=150, kernel="rbf", probability=True, random_state=10)
    models["SVM"] = svc_model

    return models
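
# Usage sketch: build the dictionary once, then fit each estimator on demand,
# e.g. models = get_classification_model(); models["SVM"].fit(X_train, y_train).
# The hyperparameters above are fixed choices; replacing them with a proper
# search (e.g. sklearn.model_selection.GridSearchCV) is left to the caller.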

def compute_detailed_evaluation_metrics(model, model_name, X_test, y_test):
    # predict the target vector
    y_predict = model.predict(X_test)

    # compute the detailed report
    clf_report = classification_report(y_test, y_predict, target_names=const.GENRES_LIST, digits=2, output_dict=True)

    # reshape the "accuracy" entry so the DataFrame matches the standard printed report
    clf_report.update({"accuracy": {"precision": None, "recall": None, "f1-score": clf_report["accuracy"],
                                    "support": clf_report.get("macro avg")["support"]}})
    df = pd.DataFrame(clf_report).transpose()

    # save the report to file
    makedir(const.DATA_FOLDER + "/" + const.CLF_REPORT_PATH)
    df.to_csv(const.DATA_FOLDER + "/" + const.CLF_REPORT_PATH + "/" + model_name + "_classification_report.csv",
              index=True, float_format="%.5f")
    return df

# Compute a simplified version of the classifier metrics report
def compute_simple_clf_metrics(model, X_test, y_test, execution_time):
    # predict the target vector
    y_predict = model.predict(X_test)

    # metrics computation
    clf_accuracy = metrics.accuracy_score(y_test, y_predict) * 100
    # RMSE (note: "squared=False" is deprecated in newer scikit-learn releases,
    # which provide metrics.root_mean_squared_error instead)
    clf_rmse = metrics.mean_squared_error(y_test, y_predict, squared=False)
    clf_f1_score = metrics.f1_score(y_test, y_predict, average="weighted")

    # collect the values into a dictionary
    dictionary = {"ACCURACY": clf_accuracy,
                  "RMSE": clf_rmse,
                  "F1_SCORE": clf_f1_score,
                  "EXECUTION_TIME": execution_time}
    return dictionary
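
# The returned dictionary has the following form (values purely illustrative):
#   {"ACCURACY": 72.3, "RMSE": 1.85, "F1_SCORE": 0.72, "EXECUTION_TIME": 12.4}
# ACCURACY is a percentage, EXECUTION_TIME is in seconds.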

def prediction_comparison(model, X_test, y_test):
    # predict the target vector
    y_predict = model.predict(X_test)

    # map the integer labels back to genre names
    genres = dict(enumerate(const.GENRES_LIST))

    clf_data = pd.DataFrame(columns=["real_genre_num", "predict_genre_num",
                                     "real_genre_label", "predict_genre_label"])
    clf_data["real_genre_num"] = y_test.astype(int)
    clf_data["predict_genre_num"] = y_predict.astype(int)

    # compare the real values with the predicted values
    clf_data["check"] = np.where(clf_data["real_genre_num"] == clf_data["predict_genre_num"], True, False)
    clf_data["real_genre_label"] = clf_data["real_genre_num"].replace(genres)
    clf_data["predict_genre_label"] = clf_data["predict_genre_num"].replace(genres)

    # count occurrences per genre for the real and the predicted labels
    input_data = pd.DataFrame()
    input_data[["Genre", "Real_Value"]] = \
        clf_data[["real_genre_label", "predict_genre_label"]].groupby(["real_genre_label"], as_index=False).count()
    input_data[["Genre", "Predict_Value"]] = \
        clf_data[["real_genre_label", "predict_genre_label"]].groupby(["predict_genre_label"], as_index=False).count()
    return input_data
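
# The result holds one row per genre with columns "Genre", "Real_Value" (count
# of true labels) and "Predict_Value" (count of predicted labels). The two
# groupby results are matched by row position, so this assumes every genre
# occurs at least once among both the true and the predicted labels.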

def model_evaluation(models, X_train, y_train, X_test, y_test,
                     show_confusion_matrix=True, show_roc_curve=True,
                     show_compare_prediction_by_genre=True, show_simple_compare=True):
    # dictionary gathering the summary of the metrics computed on the classifiers
    merge_clf_summary_results = {}

    # evaluate each classifier (NN, RF, KNN and SVM)
    for model_name, model_type in models.items():
        # start the timer for the execution time
        start_execution_time = time.time()

        if show_confusion_matrix:
            # plot the confusion matrix
            plot_function.plot_confusion_matrix(model=model_type,
                                                model_name=model_name,
                                                X_train=X_train,
                                                y_train=y_train,
                                                X_test=X_test,
                                                y_test=y_test,
                                                show_on_screen=True,
                                                store_in_folder=False)

        if model_name == "SVM":
            y_score = model_type.fit(X_train, y_train).decision_function(X_test)
        else:  # NN, RF and KNN expose predict_proba
            model_type.fit(X_train, y_train)
            y_score = model_type.predict_proba(X_test)

        if show_roc_curve:
            # plot the ROC curve
            plot_function.plot_roc(y_test=y_test,
                                   y_score=y_score,
                                   operation_name=model_name,
                                   genres_list=const.GENRES_LIST,
                                   type_of_learning="SL",
                                   show_on_screen=True,
                                   store_in_folder=False)

        if show_compare_prediction_by_genre:
            # predict the target vector
            y_predict = model_type.predict(X_test)
            # plot the histogram
            plot_function.plot_comparison_of_predictions_by_genre(y_test=y_test,
                                                                  y_pred=y_predict,
                                                                  genres_list=const.GENRES_LIST,
                                                                  model_name=model_name,
                                                                  show_on_screen=True,
                                                                  store_in_folder=False)

        if show_simple_compare:
            input_data = prediction_comparison(model=model_type, X_test=X_test, y_test=y_test)
            # evaluate actual vs. predicted counts
            plot_function.plot_predictions_evaluation(input_data=input_data,
                                                      model_name=model_name,
                                                      genres_list=const.GENRES_LIST,
                                                      show_on_screen=True,
                                                      store_in_folder=False)

        # evaluation metrics computation
        clf_report = compute_detailed_evaluation_metrics(model=model_type, model_name=model_name,
                                                         X_test=X_test, y_test=y_test)
        # plot the classification report
        plot_function.plot_classification_report(clf_report=clf_report,
                                                 model_name=model_name,
                                                 show_on_screen=True,
                                                 store_in_folder=False)

        # execution time of this classifier
        execution_time = time.time() - start_execution_time

        # summary of metrics for this classifier
        single_clf_metrics = compute_simple_clf_metrics(model=model_type, X_test=X_test, y_test=y_test,
                                                        execution_time=execution_time)
        # merge the results, keyed by classifier name
        merge_clf_summary_results[model_name] = single_clf_metrics

    # resulting dataframe with the summary metrics per classifier
    clf_summary_report = pd.DataFrame(merge_clf_summary_results)

    # save the report to file
    makedir(const.DATA_FOLDER + "/" + const.CLF_REPORT_PATH)
    clf_summary_report.to_csv(const.DATA_FOLDER + "/" + const.CLF_REPORT_PATH + "/CLFs_summary_report.csv",
                              index=True, float_format="%.2f")
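
# Usage sketch:
#   model_evaluation(get_classification_model(), X_train, y_train, X_test, y_test)
# With the flags above every plot is shown on screen (store_in_folder=False) and
# the summary CSV is written under const.DATA_FOLDER/const.CLF_REPORT_PATH.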

def classification_and_evaluation(data_path):
    # load the data
    X, y, df = load_data(data_path=data_path)
    print("\nData:\n\033[92m{}\033[0m".format(df))
    print("\nX (features):\n\033[92m{}\033[0m".format(X))
    print("\ny (labels):\n\033[92m{}\033[0m".format(y))

    # plot the correlation matrix
    plot_function.plot_correlation_matrix(input_data=X,
                                          show_on_screen=True,
                                          store_in_folder=False)

    # create the train/test split
    X_train, X_test, y_train, y_test = prepare_datasets(X=X, y=y, test_size=0.30)
    print("\nSplit data into Train and Test:")
    print("- Train set has \033[92m{}\033[0m"
          " records out of \033[92m{}\033[0m"
          " which is \033[92m{}%\033[0m".format(X_train.shape[0], len(df), round(X_train.shape[0] / len(df) * 100)))
    print("- Test set has \033[92m{}\033[0m"
          " records out of \033[92m{}\033[0m"
          " which is \033[92m{}%\033[0m\n".format(X_test.shape[0], len(df), round(X_test.shape[0] / len(df) * 100)))

    # build the models and run the evaluation
    clf_models = get_classification_model()
    model_evaluation(models=clf_models,
                     X_train=X_train,
                     y_train=y_train,
                     X_test=X_test,
                     y_test=y_test,
                     show_confusion_matrix=True,
                     show_roc_curve=True,
                     show_compare_prediction_by_genre=True,
                     show_simple_compare=True)

# used for testing
# if __name__ == "__main__":
#     classification_and_evaluation(data_path=const.DATA_PATH)