Traditional Machine Learning Classification Code

These code blocks define a function that, given a dataset and a scoring metric (e.g. precision or recall), fits standard machine learning classifiers (SVM, logistic regression, random forest, and k-nearest neighbors) to the data, tunes each one's hyperparameters via cross-validated grid search, and reports each best model's score on the test set.

```python
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def MLClassification(scoring_metric, scoring_metric_eval, X_train, y_train, X_test, y_test):

  my_seed = 123

  # Logistic Regression grid search
  parameters = {
      'penalty':('l1','l2'),
      'C':(0.005, 0.01, 0.05, 0.1, 0.2, 1, 2, 5)
  }
  Grid_LR = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv = 5, scoring = scoring_metric)
  Grid_LR.fit(X_train, y_train)
  best_LR_model = Grid_LR.best_estimator_
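  # (optional) the winning configuration and its cross-validation score
  # can be inspected via, e.g.:
  # print(Grid_LR.best_params_, Grid_LR.best_score_)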

  # KNN grid search
  parameters = {
      'n_neighbors': [3,5,7,9,11,21],
      'p': [1,2,3], # p is the l_p norm metric choice, 2 is euclidean
      'weights':['uniform','distance']
  }
  Grid_KNN = GridSearchCV(KNeighborsClassifier(), parameters, cv = 5, scoring = scoring_metric)
  Grid_KNN.fit(X_train, y_train)
  best_KNN_model = Grid_KNN.best_estimator_

  # Random Forest grid search
  parameters = {
      'n_estimators' : [40,60,80,100,200,400],
      'criterion': ['gini', 'entropy'],
      'random_state': [my_seed]
  }
  Grid_RF = GridSearchCV(RandomForestClassifier(), parameters, cv = 5, scoring = scoring_metric)
  Grid_RF.fit(X_train, y_train)
  best_RF_model = Grid_RF.best_estimator_

  # SVM Grid Search
  parameters = {
      'kernel':('rbf', 'poly'),
      'degree':(3, 5, 10), # degree of the polynomial kernel (ignored by 'rbf')
      'gamma':('scale', 0.01, 0.1), # kernel coefficient
      'C':(0.01, 0.1, 1, 2, 5),
      'probability':[True] # enables predict_proba, which the ROC plotting below relies on
  }
  Grid_SVM = GridSearchCV(SVC(), parameters, cv = 5, scoring = scoring_metric)
  Grid_SVM.fit(X_train, y_train)
  best_SVM_model = Grid_SVM.best_estimator_
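  # note: by default GridSearchCV refits the best configuration on the whole
  # training set (refit=True), so each best_estimator_ is ready to predict.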

  model_list = [best_LR_model, best_KNN_model, best_RF_model, best_SVM_model]
  model_names = ['Logistic Regression', 'KNN', 'Random Forest', 'SVM']

  # Evaluate each tuned model on the held-out test set.
  for i, model in enumerate(model_list):
    # note: score-based metrics such as average_precision_score are really
    # meant for probabilities/scores rather than hard predict() labels.
    pred = model.predict(X_test)
    # look up the metric function by its string name:
    score = getattr(metrics, scoring_metric_eval)(y_test, pred)
    print(f'{scoring_metric} of {model_names[i]} is {score}')
    print()

  return model_list, model_names
MLClassification('f1', 'f1_score', X_train, y_train, X_test, y_test)
MLClassification('recall', 'recall_score', X_train, y_train, X_test, y_test)
model_list, model_names = MLClassification('average_precision', 'average_precision_score', X_train, y_train, X_test, y_test)
```
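
For context, here is a minimal, self-contained sketch of how the inputs above might be prepared; the synthetic dataset, split ratio, and seed are illustrative assumptions, not part of the original pipeline:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# hypothetical stand-in for the project's real dataset
X, y = make_classification(n_samples=500, n_features=20, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

model_list, model_names = MLClassification(
    'f1', 'f1_score', X_train, y_train, X_test, y_test)
```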

How to draw ROC curves, using the tuned models returned by MLClassification above:

```python
import matplotlib.pyplot as plt

# model_list and model_names come from the MLClassification call above.
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--') # diagonal baseline: a random classifier
for i, model in enumerate(model_list):

  pred = model.predict_proba(X_test)[:, 1]
  fpr, tpr, _ = metrics.roc_curve(y_test, pred)

  # plot this model's ROC curve.
  plt.plot(fpr, tpr, label = model_names[i])
  # AUC summarizes the ROC curve in a single number.
  score = metrics.roc_auc_score(y_test, pred)
  print(f'ROC AUC of {model_names[i]} is {score}')
  print()

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve of various models')
plt.legend(loc = 'best')
plt.show()
```
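
Since average precision is also evaluated above, a precision-recall plot can be drawn the same way. A minimal sketch, continuing from the ROC snippet and assuming the same model_list, model_names, and test data are still in scope:

```python
plt.figure(2)
for i, model in enumerate(model_list):
  # use the predicted probability of the positive class as the score
  scores = model.predict_proba(X_test)[:, 1]
  precision, recall, _ = metrics.precision_recall_curve(y_test, scores)
  plt.plot(recall, precision, label = model_names[i])

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve of various models')
plt.legend(loc = 'best')
plt.show()
```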