Traditional Machine Learning Classification Code
These code blocks define a function that, given a dataset and a scoring metric (e.g. precision or recall), fits standard machine learning classifiers (SVM, logistic regression, random forest, KNN) to the data, tunes their hyperparameters via cross-validation, and reports each tuned model's score on the test set.
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
def MLClassification(scoring_metric, scoring_metric_eval, X_train, y_train, X_test, y_test):
    my_seed = 123

    # Logistic Regression grid search
    parameters = {
        'penalty': ('l1', 'l2'),
        'C': (0.005, 0.01, 0.05, 0.1, 0.2, 1, 2, 5)
    }
    Grid_LR = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=5, scoring=scoring_metric)
    Grid_LR.fit(X_train, y_train)
    best_LR_model = Grid_LR.best_estimator_

    # KNN grid search
    parameters = {
        'n_neighbors': [3, 5, 7, 9, 11, 21],
        'p': [1, 2, 3],  # p is the l_p norm choice; 2 is Euclidean
        'weights': ['uniform', 'distance']
    }
    Grid_KNN = GridSearchCV(KNeighborsClassifier(), parameters, cv=5, scoring=scoring_metric)
    Grid_KNN.fit(X_train, y_train)
    best_KNN_model = Grid_KNN.best_estimator_

    # Random Forest grid search
    parameters = {
        'n_estimators': [40, 60, 80, 100, 200, 400],
        'criterion': ['gini', 'entropy'],
        'random_state': [my_seed]
    }
    Grid_RF = GridSearchCV(RandomForestClassifier(), parameters, cv=5, scoring=scoring_metric)
    Grid_RF.fit(X_train, y_train)
    best_RF_model = Grid_RF.best_estimator_

    # SVM grid search
    parameters = {
        'kernel': ('rbf', 'poly'),
        'degree': (3, 5, 10),            # degree of the polynomial kernel
        'gamma': ('scale', 0.01, 0.1),   # kernel coefficient
        'C': (0.01, 0.1, 1, 2, 5),
        'probability': [True]            # needed for predict_proba when plotting ROC curves later
    }
    Grid_SVM = GridSearchCV(SVC(), parameters, cv=5, scoring=scoring_metric)
    Grid_SVM.fit(X_train, y_train)
    best_SVM_model = Grid_SVM.best_estimator_

    # Evaluate each tuned model on the test set
    model_list = [best_LR_model, best_KNN_model, best_RF_model, best_SVM_model]
    model_names = ['Logistic Regression', 'KNN', 'Random Forest', 'SVM']
    for i, model in enumerate(model_list):
        pred = model.predict(X_test)
        # calling a metric function by its string name:
        score = getattr(metrics, scoring_metric_eval)(y_test, pred)
        print(f'{scoring_metric} of {model_names[i]} is {score}')
    print()

    # return the fitted best models so they can be reused (e.g. for the ROC curves below)
    return model_list
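The calls below assume X_train, y_train, X_test, y_test are already defined as a feature matrix and binary label vector split into train and test sets. A minimal sketch of one way to produce them, using a synthetic dataset purely for illustration (make_classification and the 75/25 split are assumptions, not part of the original pipeline):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# illustrative data only: any feature matrix X and binary label vector y will do
X, y = make_classification(n_samples=500, n_features=20, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123, stratify=y)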
model_list = MLClassification('f1', 'f1_score', X_train, y_train, X_test, y_test)  # keep the F1-tuned models for the ROC curves below
MLClassification('recall', 'recall_score', X_train, y_train, X_test, y_test)
MLClassification('average_precision', 'average_precision_score', X_train, y_train, X_test, y_test)
How to draw ROC curves, reusing the best models returned by MLClassification above:
import matplotlib.pyplot as plt

# model_list holds the fitted best models returned by MLClassification above (here, the F1-tuned ones)
model_names = ['Logistic Regression', 'KNN', 'Random Forest', 'SVM']
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line (random classifier)
for i, model in enumerate(model_list):
    # use the predicted probability of the positive class as the score
    pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, pred)
    # plot this model's ROC curve
    plt.plot(fpr, tpr, label=model_names[i])
    # area under the ROC curve for this model
    score = metrics.roc_auc_score(y_test, pred)
    print(f'AUC of {model_names[i]} is {score}')
print()
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve of various models')
plt.legend(loc='best')
plt.show()
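Since average_precision is one of the metrics evaluated above, a companion precision-recall curve can be drawn with the same loop structure. This is a sketch, not code from the original page, under the same assumption that model_list and model_names are defined as above:

plt.figure(2)
for i, model in enumerate(model_list):
    # use the positive-class probability as the score, as in the ROC loop
    pred = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = metrics.precision_recall_curve(y_test, pred)
    ap = metrics.average_precision_score(y_test, pred)
    plt.plot(recall, precision, label=f'{model_names[i]} (AP = {ap:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curves of various models')
plt.legend(loc='best')
plt.show()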