diff --git a/labs/2019/lab3_lr_svm_eval_sol.ipynb b/labs/2019/lab3_lr_svm_eval_sol.ipynb
new file mode 100644
index 0000000..5aa85fd
--- /dev/null
+++ b/labs/2019/lab3_lr_svm_eval_sol.ipynb
@@ -0,0 +1,1525 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Lab 3: Logistic Regression, Support Vector Machines, and Evaluation\n",
+ "\n",
+ "\n",
+ "In this lab we'll get some hands-on experience with two more classifiers we've seen in class\n",
+ "- Logistic Regression\n",
+ "- Support Vector Machines\n",
+ "\n",
+ "We will also explore evaluation metrics that we covered in class and understand how to calculate them."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Goals for this lab\n",
+ "\n",
+ "- Understand the practical implications for changing the parameters used in Logistic Regression and Support Vector Machines\n",
+ " \n",
+ "- Learn more about the evaluation metrics covered in class and learn how to calculate them (at different thresholds)\n",
+ " - accuracy\n",
+ " - precision\n",
+ " - recall\n",
+ " - AUC"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import graphviz # If you don't have this, install via pip/conda\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import sklearn.tree as tree\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score as accuracy\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.metrics import precision_recall_curve\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.svm import LinearSVC\n",
+ "%matplotlib inline\n",
+ "\n",
+ "# exercise: what additional modules should you import?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Data\n",
+ "We'll continue to use the same data as in the previous lab.\n",
+ "\n",
+ "It is a subset of the data set from https://www.kaggle.com/new-york-state/nys-patient-characteristics-survey-pcs-2015\n",
+ "\n",
+ "The data has been downloaded, modified, and is in the github repo for the lab\n",
+ "\n",
+ "You should also try this with other data sets you have been provided for the homeworks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Change this to wherever you're storing your data\n",
+ "datafile = '../data/nysmedicaldata.csv'\n",
+ "df = pd.read_csv(datafile)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Region Served
\n",
+ "
Age Group
\n",
+ "
Sex
\n",
+ "
Transgender
\n",
+ "
Sexual Orientation
\n",
+ "
Hispanic Ethnicity
\n",
+ "
Race
\n",
+ "
Living Situation
\n",
+ "
Household Composition
\n",
+ "
Preferred Language
\n",
+ "
...
\n",
+ "
No Insurance
\n",
+ "
Unknown Insurance Coverage
\n",
+ "
Medicaid Insurance
\n",
+ "
Medicaid Managed Insurance
\n",
+ "
Medicare Insurance
\n",
+ "
Private Insurance
\n",
+ "
Child Health Plus Insurance
\n",
+ "
Other Insurance
\n",
+ "
Criminal Justice Status
\n",
+ "
Three Digit Residence Zip Code
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
HUDSON RIVER REGION
\n",
+ "
ADULT
\n",
+ "
MALE
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
YES
\n",
+ "
OTHER
\n",
+ "
OTHER LIVING SITUATION
\n",
+ "
NOT APPLICABLE
\n",
+ "
ENGLISH
\n",
+ "
...
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
113
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
NEW YORK CITY REGION
\n",
+ "
ADULT
\n",
+ "
MALE
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
WHITE ONLY
\n",
+ "
INSTITUTIONAL SETTING
\n",
+ "
NOT APPLICABLE
\n",
+ "
ENGLISH
\n",
+ "
...
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
NO
\n",
+ "
UNKNOWN
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
UNKNOWN
\n",
+ "
YES
\n",
+ "
113
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
HUDSON RIVER REGION
\n",
+ "
ADULT
\n",
+ "
MALE
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
WHITE ONLY
\n",
+ "
PRIVATE RESIDENCE
\n",
+ "
COHABITATES WITH OTHERS
\n",
+ "
ENGLISH
\n",
+ "
...
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
107
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
NEW YORK CITY REGION
\n",
+ "
ADULT
\n",
+ "
FEMALE
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
OTHER
\n",
+ "
OTHER LIVING SITUATION
\n",
+ "
NOT APPLICABLE
\n",
+ "
ASIAN AND PACIFIC ISLAND
\n",
+ "
...
\n",
+ "
YES
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NOT APPLICABLE
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
888
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
LONG ISLAND REGION
\n",
+ "
ADULT
\n",
+ "
MALE
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
BLACK ONLY
\n",
+ "
OTHER LIVING SITUATION
\n",
+ "
NOT APPLICABLE
\n",
+ "
ENGLISH
\n",
+ "
...
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
YES
\n",
+ "
UNKNOWN
\n",
+ "
YES
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
NO
\n",
+ "
117
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 65 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Region Served Age Group Sex Transgender \\\n",
+ "0 HUDSON RIVER REGION ADULT MALE NO, NOT TRANSGENDER \n",
+ "1 NEW YORK CITY REGION ADULT MALE NO, NOT TRANSGENDER \n",
+ "2 HUDSON RIVER REGION ADULT MALE NO, NOT TRANSGENDER \n",
+ "3 NEW YORK CITY REGION ADULT FEMALE NO, NOT TRANSGENDER \n",
+ "4 LONG ISLAND REGION ADULT MALE NO, NOT TRANSGENDER \n",
+ "\n",
+ " Sexual Orientation Hispanic Ethnicity Race \\\n",
+ "0 STRAIGHT OR HETEROSEXUAL YES OTHER \n",
+ "1 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO WHITE ONLY \n",
+ "2 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO WHITE ONLY \n",
+ "3 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO OTHER \n",
+ "4 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO BLACK ONLY \n",
+ "\n",
+ " Living Situation Household Composition Preferred Language \\\n",
+ "0 OTHER LIVING SITUATION NOT APPLICABLE ENGLISH \n",
+ "1 INSTITUTIONAL SETTING NOT APPLICABLE ENGLISH \n",
+ "2 PRIVATE RESIDENCE COHABITATES WITH OTHERS ENGLISH \n",
+ "3 OTHER LIVING SITUATION NOT APPLICABLE ASIAN AND PACIFIC ISLAND \n",
+ "4 OTHER LIVING SITUATION NOT APPLICABLE ENGLISH \n",
+ "\n",
+ " ... No Insurance Unknown Insurance Coverage Medicaid Insurance \\\n",
+ "0 ... NO NO YES \n",
+ "1 ... NO NO YES \n",
+ "2 ... NO NO YES \n",
+ "3 ... YES NO NO \n",
+ "4 ... NO NO YES \n",
+ "\n",
+ " Medicaid Managed Insurance Medicare Insurance Private Insurance \\\n",
+ "0 NO NO NO \n",
+ "1 NO UNKNOWN NO \n",
+ "2 NO NO NO \n",
+ "3 NOT APPLICABLE NO NO \n",
+ "4 UNKNOWN YES NO \n",
+ "\n",
+ " Child Health Plus Insurance Other Insurance Criminal Justice Status \\\n",
+ "0 NO NO YES \n",
+ "1 NO UNKNOWN YES \n",
+ "2 NO NO YES \n",
+ "3 NO NO YES \n",
+ "4 NO NO NO \n",
+ "\n",
+ " Three Digit Residence Zip Code \n",
+ "0 113 \n",
+ "1 113 \n",
+ "2 107 \n",
+ "3 888 \n",
+ "4 117 \n",
+ "\n",
+ "[5 rows x 65 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Some Quick Data Exploration\n",
+ "Before running any sort of model on your dataset, it's always a good idea to do some quick data exploration to get a sense of what your data looks like. Try to answer the following questions with some sort of plot/histogram/etc:\n",
+ "\n",
+ "1) What do the distributions of each feature look like?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Ex\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Using scikitlearn for classification\n",
+ "\n",
+ "sklearn is a very useful Python package for building machine learning models. To build a model in sklearn, you need to have a matrix (or dataframe) with X and y columns. X is your set of features/predictors. y is a single column that is your label. We'll take the following steps:\n",
+ "\n",
+ "1. Select/create column as label/outcome (y)\n",
+ "2. Select/create columns as features (X)\n",
+ "3. Create Training Set\n",
+ "4. Create Validation Set\n",
+ "5. Build model on Training Set\n",
+ "6. Predict risk scores for the Validation Set\n",
+ "7. Calculate performance metric(s)\n",
+ "\n",
+ "## Some useful things to know in sklearn\n",
+ "\n",
+ "fit = train an algorithm\n",
+ "\n",
+ "predict_proba = predict a \"risk\" score for all possible classes for a given record (classification only)\n",
+ "\n",
+ "\n",
+ "## Important- never use .predict\n",
+ "There is also a function called \"predict\" which first runs predict_proba and then predicts a 1 if the score > 0.5 and 0 otherwise. *Never* use that function since 0.5 is a completely arbitrary threshold to call a prediction 1 vs 0.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Create label/outcome\n",
+ "One thing we can do with this dataset is to try to use the various feature columns to classify whether a person has High Blood Pressure. Let's create a column that is 1 if a person has High Blood Pressure and 0 otherwise"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# code\n",
+ "df['HBP'] = np.where(df['High Blood Pressure']==\"YES\", 1, 0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Question: what percentage of people have High Blood Pressure?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.21600738092492217"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# code\n",
+ "df['HBP'].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. create or select existing predictors/features\n",
+ "\n",
+ "For now, let's take a handful of existing columns to use.\n",
+ "\n",
+ "sklearn needs features to be numeric and not categorical so we'll have to turn our selected features to be binary (also known as dummy variables)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Region Served
\n",
+ "
Age Group
\n",
+ "
Transgender
\n",
+ "
Sexual Orientation
\n",
+ "
Hispanic Ethnicity
\n",
+ "
Living Situation
\n",
+ "
Household Composition
\n",
+ "
Preferred Language
\n",
+ "
Veteran Status
\n",
+ "
Employment Status
\n",
+ "
...
\n",
+ "
Drug Substance Disorder_YES
\n",
+ "
Drug Substance Disorder_nan
\n",
+ "
Criminal Justice Status_NO
\n",
+ "
Criminal Justice Status_UNKNOWN
\n",
+ "
Criminal Justice Status_YES
\n",
+ "
Criminal Justice Status_nan
\n",
+ "
Private Insurance_NO
\n",
+ "
Private Insurance_UNKNOWN
\n",
+ "
Private Insurance_YES
\n",
+ "
Private Insurance_nan
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
HUDSON RIVER REGION
\n",
+ "
ADULT
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
YES
\n",
+ "
OTHER LIVING SITUATION
\n",
+ "
NOT APPLICABLE
\n",
+ "
ENGLISH
\n",
+ "
NO
\n",
+ "
NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ...
\n",
+ "
...
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
NEW YORK CITY REGION
\n",
+ "
ADULT
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
INSTITUTIONAL SETTING
\n",
+ "
NOT APPLICABLE
\n",
+ "
ENGLISH
\n",
+ "
NO
\n",
+ "
NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ...
\n",
+ "
...
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
HUDSON RIVER REGION
\n",
+ "
ADULT
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
PRIVATE RESIDENCE
\n",
+ "
COHABITATES WITH OTHERS
\n",
+ "
ENGLISH
\n",
+ "
NO
\n",
+ "
NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ...
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
NEW YORK CITY REGION
\n",
+ "
ADULT
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
OTHER LIVING SITUATION
\n",
+ "
NOT APPLICABLE
\n",
+ "
ASIAN AND PACIFIC ISLAND
\n",
+ "
NO
\n",
+ "
NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ...
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
LONG ISLAND REGION
\n",
+ "
ADULT
\n",
+ "
NO, NOT TRANSGENDER
\n",
+ "
STRAIGHT OR HETEROSEXUAL
\n",
+ "
NO, NOT HISPANIC/LATINO
\n",
+ "
OTHER LIVING SITUATION
\n",
+ "
NOT APPLICABLE
\n",
+ "
ENGLISH
\n",
+ "
NO
\n",
+ "
NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ...
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 91 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Region Served Age Group Transgender \\\n",
+ "0 HUDSON RIVER REGION ADULT NO, NOT TRANSGENDER \n",
+ "1 NEW YORK CITY REGION ADULT NO, NOT TRANSGENDER \n",
+ "2 HUDSON RIVER REGION ADULT NO, NOT TRANSGENDER \n",
+ "3 NEW YORK CITY REGION ADULT NO, NOT TRANSGENDER \n",
+ "4 LONG ISLAND REGION ADULT NO, NOT TRANSGENDER \n",
+ "\n",
+ " Sexual Orientation Hispanic Ethnicity Living Situation \\\n",
+ "0 STRAIGHT OR HETEROSEXUAL YES OTHER LIVING SITUATION \n",
+ "1 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO INSTITUTIONAL SETTING \n",
+ "2 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO PRIVATE RESIDENCE \n",
+ "3 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO OTHER LIVING SITUATION \n",
+ "4 STRAIGHT OR HETEROSEXUAL NO, NOT HISPANIC/LATINO OTHER LIVING SITUATION \n",
+ "\n",
+ " Household Composition Preferred Language Veteran Status \\\n",
+ "0 NOT APPLICABLE ENGLISH NO \n",
+ "1 NOT APPLICABLE ENGLISH NO \n",
+ "2 COHABITATES WITH OTHERS ENGLISH NO \n",
+ "3 NOT APPLICABLE ASIAN AND PACIFIC ISLAND NO \n",
+ "4 NOT APPLICABLE ENGLISH NO \n",
+ "\n",
+ " Employment Status ... \\\n",
+ "0 NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ... ... \n",
+ "1 NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ... ... \n",
+ "2 NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ... ... \n",
+ "3 NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ... ... \n",
+ "4 NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING ... ... \n",
+ "\n",
+ " Drug Substance Disorder_YES Drug Substance Disorder_nan \\\n",
+ "0 1 0 \n",
+ "1 1 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " Criminal Justice Status_NO Criminal Justice Status_UNKNOWN \\\n",
+ "0 0 0 \n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 1 0 \n",
+ "\n",
+ " Criminal Justice Status_YES Criminal Justice Status_nan \\\n",
+ "0 1 0 \n",
+ "1 1 0 \n",
+ "2 1 0 \n",
+ "3 1 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " Private Insurance_NO Private Insurance_UNKNOWN Private Insurance_YES \\\n",
+ "0 1 0 0 \n",
+ "1 1 0 0 \n",
+ "2 1 0 0 \n",
+ "3 1 0 0 \n",
+ "4 1 0 0 \n",
+ "\n",
+ " Private Insurance_nan \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 91 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# code\n",
+ "cols_to_transform = [ 'Sex', 'Race', 'Obesity', 'Smokes', 'Alcohol Related Disorder','Drug Substance Disorder',\n",
+ " 'Criminal Justice Status','Private Insurance']\n",
+ "df = pd.get_dummies( df, dummy_na=True, columns = cols_to_transform )\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's define a vector of column names with only those dummy variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# code\n",
+ "selected_features = ['Sex_FEMALE','Sex_MALE','Sex_UNKNOWN','Race_BLACK ONLY','Race_MULTI-RACIAL','Race_OTHER',\n",
+ " 'Race_UNKNOWN RACE','Race_WHITE ONLY', 'Obesity_YES', 'Obesity_NO', 'Obesity_nan', \n",
+ " 'Smokes_YES', 'Smokes_NO','Alcohol Related Disorder_NO','Alcohol Related Disorder_YES',\n",
+ " 'Alcohol Related Disorder_UNKNOWN','Criminal Justice Status_YES','Criminal Justice Status_NO',\n",
+ " 'Criminal Justice Status_UNKNOWN','Private Insurance_NO','Private Insurance_YES','Private Insurance_UNKNOWN']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Train/Test Splits\n",
+ "\n",
+ "Create a train/test set split using sklearn's [train_test_split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function. We'll use these train/test splits for evaluating all our classification models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# code\n",
+ "# Features (the selected dummy-encoded columns) and the binary HBP label.\n",
+ "x = df[selected_features]\n",
+ "y = df['HBP']\n",
+ "test_size = 0.3 # you can adjust this\n",
+ "# Fix random_state so the split, and every metric computed from it, is reproducible on re-run.\n",
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "# Logistic Regression\n",
+ "See the sklearn documentation on Logistic Regression to see its parameters. The ones we'll mostly be interested in are:\n",
+ "- penalty\n",
+ "- C"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Remember that when training a model, **you should only use the training data!** The test set is reserved exclusively for evaluating your model. Now let's use the classifier:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# code\n",
+ "# Fit a logistic regression on the training split only, then score the held-out test rows.\n",
+ "# predict_proba returns one column of scores per class.\n",
+ "lr = LogisticRegression(solver='liblinear', random_state=0).fit(x_train, y_train)\n",
+ "pred_scores = lr.predict_proba(x_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[0.75929133 0.24070867]\n",
+ " [0.64513388 0.35486612]\n",
+ " [0.75818005 0.24181995]\n",
+ " ...\n",
+ " [0.64228939 0.35771061]\n",
+ " [0.82387763 0.17612237]\n",
+ " [0.75929133 0.24070867]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(pred_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Logistic Regression Tasks:\n",
+ "\n",
+ "The goal here is to explore different penalty parameters and different C values. You can also try modifying other parameters to see their impact. How does accuracy change, using different thresholds, as you vary penalty and C values? You can write a nested for loop that loops over all the parameters and values and store the results in a data frame (similar to last lab)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "Ref: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
+ "\n",
+ "You'll notice that LogisticRegression takes a ton of parameters. We'll play around with the \"penalty\" and \"C\" parameters.\n",
+ "If we set the penalty parameter to ['l2'](http://mathworld.wolfram.com/L2-Norm.html), sklearn's LogisticRegression model solves the following minimization problem:\n",
+ "\n",
+ "$$ \\min_{\\beta} \\frac{1}{2} ||\\beta||_2^2 + C \\sum_{i} \\log ( \\exp( -y_i (X_i^T \\beta) ) + 1)$$\n",
+ "\n",
+ "Similarly, if we set the penalty parameter to ['l1'](http://mathworld.wolfram.com/L1-Norm.html), LogisticRegression will solve the following minimization problem:\n",
+ "\n",
+ "$$\\min_{\\beta} ||\\beta||_1 + C \\sum_{i} \\log ( \\exp( -y_i (X_i^T \\beta) ) + 1)$$\n",
+ "\n",
+ "where $$||\\beta||_2 = \\sqrt { \\sum_{i} \\beta_i^2 }$$ and $$||\\beta||_1 = \\sum_{i} | \\beta_i | $$ \n",
+ "\n",
+ "Try running logistic regression with both L1 and L2 penalties and a mix of C values. Something like $(10^{-2}, 10^{-1}, 1, 10, 10^2)$ is reasonable."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The true number of HBP is 541/2602 from the data, with percentage 20.79%\n",
+ "\n",
+ "(Threshold: 0.1), the total number of predicted HBP is 2539, the accuracy is 0.23\n",
+ "(Threshold: 0.2), the total number of predicted HBP is 1604, the accuracy is 0.48\n",
+ "(Threshold: 0.25), the total number of predicted HBP is 578, the accuracy is 0.70\n",
+ "(Threshold: 0.3), the total number of predicted HBP is 377, the accuracy is 0.74\n",
+ "(Threshold: 0.35), the total number of predicted HBP is 291, the accuracy is 0.76\n",
+ "(Threshold: 0.4), the total number of predicted HBP is 113, the accuracy is 0.78\n",
+ "(Threshold: 0.5), the total number of predicted HBP is 0, the accuracy is 0.79\n",
+ "(Threshold: 0.6), the total number of predicted HBP is 0, the accuracy is 0.79\n",
+ "(Threshold: 0.7), the total number of predicted HBP is 0, the accuracy is 0.79\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Turn the positive-class scores into hard labels at several thresholds and\n",
+ "# report, for each one, how many positives are predicted and the resulting accuracy.\n",
+ "n_test = len(y_test)\n",
+ "n_true_hbp = sum(y_test)\n",
+ "print(\"The true number of HBP is {}/{} from the data, with percentage {:.2f}%\\n\".format(\n",
+ "    n_true_hbp, n_test, 100.*n_true_hbp/n_test))\n",
+ "for threshold in [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7]:\n",
+ "    # Index 1 of each score row is the second predict_proba column (the HBP=1 class).\n",
+ "    # The loop variable is `score`, not `x`, so the feature matrix `x` is never shadowed\n",
+ "    # (on Python 2 a comprehension variable leaks and would overwrite it).\n",
+ "    pred_label = [1 if score[1]>threshold else 0 for score in pred_scores]\n",
+ "    # sklearn metrics take their arguments as (y_true, y_pred).\n",
+ "    print(\"(Threshold: {}), the total number of predicted HBP is {}, the accuracy is {:.2f}\".format(\n",
+ "        threshold, sum(pred_label), accuracy(y_test, pred_label)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It seems like 0.3 is a reasonable value. Then we fix the threshold and adjust penalty and the C value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(reg: l1, C: 0.01), the total number of predicted HBP is 504, the accuracy is 0.72\n",
+ "(reg: l1, C: 0.1), the total number of predicted HBP is 394, the accuracy is 0.74\n",
+ "(reg: l1, C: 1), the total number of predicted HBP is 376, the accuracy is 0.75\n",
+ "(reg: l1, C: 10), the total number of predicted HBP is 372, the accuracy is 0.75\n",
+ "(reg: l1, C: 100), the total number of predicted HBP is 377, the accuracy is 0.74\n",
+ "\n",
+ "(reg: l2, C: 0.01), the total number of predicted HBP is 397, the accuracy is 0.74\n",
+ "(reg: l2, C: 0.1), the total number of predicted HBP is 378, the accuracy is 0.75\n",
+ "(reg: l2, C: 1), the total number of predicted HBP is 372, the accuracy is 0.75\n",
+ "(reg: l2, C: 10), the total number of predicted HBP is 377, the accuracy is 0.74\n",
+ "(reg: l2, C: 100), the total number of predicted HBP is 377, the accuracy is 0.74\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Hold the decision threshold fixed, then sweep the penalty type and the C value.\n",
+ "threshold = 0.3\n",
+ "penalties = ['l1', 'l2']\n",
+ "c_values = [10**-2, 10**-1, 1 , 10, 10**2]\n",
+ "for norm in penalties:\n",
+ "    for reg_strength in c_values:\n",
+ "        lrf = LogisticRegression(random_state=0, solver='liblinear', penalty=norm, C=reg_strength).fit(x_train, y_train)\n",
+ "        pred_scores = lrf.predict_proba(x_test)\n",
+ "        # `score` (not `x`) keeps the feature matrix `x` from being shadowed or overwritten.\n",
+ "        pred_label = [1 if score[1]>threshold else 0 for score in pred_scores]\n",
+ "        # sklearn metrics take their arguments as (y_true, y_pred).\n",
+ "        print(\"(reg: {}, C: {}), the total number of predicted HBP is {}, the accuracy is {:.2f}\".format(\n",
+ "            norm, reg_strength, sum(pred_label), accuracy(y_test, pred_label)))\n",
+ "    # Blank line between penalty groups. A bare `print` is a no-op expression on Python 3,\n",
+ "    # and print('') emits an empty line on both Python 2 and Python 3.\n",
+ "    print('')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Understanding what's going on inside Logistic Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To really see the difference between L1 and L2 regularization, we need to take a closer look at the models they produced. Plot a histogram of the weight values of LogisticRegression models for each C value. You can access these weight coefficients via the coef\\_ attribute in LogisticRegression. Do you notice anything interesting happening as the C value varies?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# code\n",
+ "# Collect the fitted coefficient arrays for every (penalty, C) combination.\n",
+ "# coefs[i][j] holds coef_ for penalties[i] fitted with c_values[j].\n",
+ "coefs = []\n",
+ "\n",
+ "for norm in penalties:\n",
+ "    coef = []\n",
+ "    for reg_strength in c_values:\n",
+ "        # Pass the loop variable as C. The earlier `C=c_value` used a name that no cell above\n",
+ "        # defines, so it either failed or fitted every model with one stale C value.\n",
+ "        lrf = LogisticRegression(random_state=0, solver='liblinear', penalty=norm, C=reg_strength).fit(x_train, y_train)\n",
+ "        # Only the weights are needed here; predictions are not recomputed.\n",
+ "        coef.append(lrf.coef_)\n",
+ "    coefs.append(coef)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi40LCBodHRwOi8vbWF0cGxvdGxpYi5vcmcv7US4rQAAH1NJREFUeJzt3Xm8VXW9//HXG9CwQDAhTSDRJBVTy9BScyjN0ErrZgY3S7oO18pu06+i4ZrRnDctb3aLzOtQOWfRlbJMpcEJHJAQTZwCR1ASyQGRz++P73cvF5u9z14Hzj77HHw/H4/9OGv47u/6rLX3WZ+1vmvt71JEYGZmBjCg0wGYmVnf4aRgZmYFJwUzMys4KZiZWcFJwczMCk4KZmZWcFLoRZLeJ+l36/je+ZL27+GQ+jxJv5F0VF9dvqSzJH21Yl1jJYWkQT0XYXv1x5ht/TgpNCHpXkkH9mSdEfGziDiowrLX2tFExE4RcXV3llf6h16RX/dKmtrNsDsqIg6OiLP7wvIlTZH0556qW9IRkq6R9KSkq3uq3naSdLWkY3qwvk0lfVfS3/N39K48PqKnlpGXM13SHZJWS5rSYP4nJD0kabmkMyW9qDRvrKSr8ud0e1f7BUkvyu9fnuv7ZN38A3IdT+Y6t+7J9ewJTgovDMMjYghwOPCfkt7S0wvwkeQ6eQz4LvDNTgfSCZI2Bv4A7ARMBDYF9gQeBfbo4cXNBT4M3NQgjrcCU4EDgK2BbYEvl4qcB9wMbA58AbhY0sgmyzkJGJfreRPwGUkT83JGAL8A/hN4KTAHuGA916vnRYRfDV7AvcCBTeYdCywk/VPPALYqzTsIuAN4HPgBMAs4Js+bAvw5Dws4FXgEWA7MA14NHAc8C6wEVgC/ro8HGAh8HrgLeAK4ERjTIM6xQACDStNuAD5dGt8KuARYAtwD/Edp3ibA2cAyYAHwGWBx3Tb6LHAr8AwwqEV9e5D+EZYDDwOn5OmDgZ+Sdgb/AGYDW+R5V5e23wDgi8B9ebudAwyrW9ejgL8DS4EvNPn8tsnLGZDHfww8Upp/LvDx8vKBHYGngefy5/KPPP8s4HTgsvxZXA+8ssly1/o88vRjgKtbfB/3Bxbnz31p3vbvK81/EfBfed0fBn4IbFL33k/l7fYg8MHSe99G2uktBxYBJzWKGfhaXv+n8zb4fl7379TFOgP4RIX/sWNyrEN68f/6z8CUumk/B75eGj8AeCgPv4r03R5amv8n4Pgm9T8AHFQa/wpwfh4+DrimNO8lwFPADr21/lVePlPoJklvBr4BHAG8nLSDOj/PGwFcDHyOdFRxB7BXk6oOAvYlfemG5foejYjpwM+Ab0fEkIh4R4P3fhKYDBxCOrr6N+DJCrG/gZR4FubxAcCvSUdRo0j/DB/PR04AXyLtFLYF3gIc2aDayaSdynBgdYv6vgd8LyI2BV4JXJinH5W3wRjSdjue9M9Sb0p+vSnHNIS0Yyp7I7B9XvaJknasryQi7iHtAF+bJ+0LrCiV3Y+UzMvvWZDjujZ/LsNLsyeRjiw3I23brzWIvSdsCYwgbdujgOmSts/zvkn6Lr0G2C6XObHuvcPy9KOB0yVtluf9E/gA6TN8G/AhSe+sX3hEfIG0Qzwhb4MTSAcNk/N3qfY/cCBpR9vKgcBvI2JFtdUHSbdK+keT1w+q1lNnJ9J3tmYusIWkzfO8uyPiibr5OzWIbTPSPqG+rlrZNZYTEf8kHditVVcnOSl03/uAMyPipoh4hpQA9pQ0lrSTnh8Rv4iIVcBpwENN6nkWGArsACgiFkTEgxVjOAb4YkTcEcnciHi0i/JLJT0FXEs6e/llnr47MDIipkXEyoi4m3TUPCnPP4J0BLUsIhbn9al3WkQsioinKtT3LLCdpBERsSIiritN3xzYLiKei4gbI2J5g2W9j3R2cXfekXwOmFTXdPXliHgqIuaS/gF3bbJNZgH7
Sdoyj1+cx7chJdq5Td7XyKURcUP+zH9G2jG3y39GxDMRMYt0dnKEJJGOQj8REY/lHdjXeX67Q9rG0yLi2YiYSTrS3x4gIq6OiHkRsToibiU1l+xXJZiIuIF0VnxAnjSJdNbzcIW3b046a6ksInaJiOFNXh/uTl0lQ0jrUFMbHtpgXm3+0Cb1lN9fX7Y7dXWMk0L3bUU6OwAg75weJR2BbUU6/a7NC9Jp+1oi4kqeP/1+JF8I27RiDGNIRxhVjSB9IT9FakrYKE/fGtiqfLRFap7YIs9fY33qhhtNa1Xf0aSj2dslzZb09jz9XOBy4HxJD0j6tqSNWNsa2z4PDyrVD2sm4Sd5/h+13izSttgX+COpmWi//PpTRKxu8r5Gqi5zfS3LR5c195G2yUjgxcCNpe3+2zy95tGctNaKU9Lr80XPJZIeJ50RdedC79k8fxZ5JOnzrOJR0pF1p60gHQjU1IafaDCvNv8J1raiNL9R2e7U1TFOCt33AGnnB4Ckl5COeO4nHfWMLs1TebxeRJwWEa8DxpN2lp+uzWoRwyJS80tl+Qj8FFJ7cO2IahFwT93R1tCIOCTPX2N9SMlorarr4mpaX0TcGRGTgZcB3yJdsHtJPnr9ckSMJzW3vZ3UnFFvjW0PvAJYRWqX7q5ZwD6kxDCL1Na8Nw2ajpqsaydslr9vNa8gbZOlpOa2nUrbfVikmwuq+DnpOsCYiBhGuh6hJmUbbYOfAodJ2pV07eWXDco0cgXw1rp16lK+NXtFk9cPq9ZTZz5rnlHuCjycz77nA9tKGlo3f359JRGxjPQ/U19Xreway8nr/cpGdXWSk0LXNpI0uPQaRDq1/qCk1+Tb1r4OXB8R95JO53eW9M5c9iOktty1SNo9H6FtRGrTfZrUJg9pJ7dtF3GdAXxF0jglu+T2zyq+SbojYjDpovMTkj4raRNJAyW9WtLuueyFwOckbSZpFHBCi7q7rE/SkZJG5qPwf+T3rJb0Jkk7SxpIaut/trQtys4DPiFpG0lDSNv+groj4Eoi4k7SjvRIYFZurnoYeDfNk8LDwOh818x6y9tnMOlsZ0D+jjU6Qyr7sqSNJe1DSp4X5e35Y+BUSS/LdY8qXctpZSjwWEQ8LWkP4F+7KLvWdzM3Lc4mnSFckpsSa+t4lqSzmtR1LulA4hJJO0gaIGlzSZ+XdEijN0S6NXtIk9fxzYLO22wwKdnV/q9r+79zgKMljZc0nHQzw1l5eX8DbgG+lN/zLmAX0s0UjZwDfDH/z+xAuimltv6XAq+W9O4cy4nArRFxe7O4O8FJoWszSTuO2uukiLiCdEvZJaSjgleS224jYinwHuDbpFPj8aS7bZ5pUPempH/kZaRmgEeBk/O8nwDjc1NAo6OuU0g77N+RdqI/Id0pVMVleZnHRsRzpB3La0h3Ci0lJZxhuew0UvPXPaSjuoubrAuQzkZa1DcRmC9pBemi86S8A9ky172cdJfTLBo3QZyZp/8x1/808NGK693ILFKzyqLSuGhw22J2Jemo7iFJS9djuTXvJ32v/od01vIU6TvRzEOkz+4B0rWL40s7lM+SLnJfJ2k56fPavmEta/swME3SE6Qd1YVdlP0ecLikZZLK15jOBnZm7c9tDPCXRhXla3IHArcDvyd9/jeQmq6urxh7Vb8jbd+9gOl5eN8cx29J/7NXke7euo90k0XNJGACadt/Ezg8IpZA8YPU8pH+l0hNu/eRvk8n5/rJ73k36UaEZcDrWfO6T5+g1Oxt7ZCPRBaTbh28qtPxrC9JHyLtyCtdhLSeo/Rr9p9GRNPmyE6StC+pGWnrfC2t9juEucAuEfFsJ+Oz6nym0MMkvVXS8Ny09HnSked1Ld7WJ0l6uaS982n99qQL1Zd2Oi7rW3KT18eAM6J0lJnvQNvRCaF/cVLoeXuSTh+XAu8A3lluY+1nNgZ+RLo74krgV6RbWs0AUPptxz9IdxF9t8PhWA9w85GZmRV8pmBmZoV+14nZiBEj
YuzYsZ0Ow8ysX7nxxhuXRkSzjvwK/S4pjB07ljlz5nQ6DDOzfkXSfa1LufnIzMxKnBTMzKzgpGBmZgUnBTMzKzgpmJlZwUnBzMwKbUsKks6U9IikvzaZL0mnSVqo9Ii93doVi5mZVdPOM4WzSF0lN3MwMC6/jiN1H2xmZh3UtqQQEX8EHuuiyGHAOZFcBwyX1BcezWdm9oLVyWsKo1jz+b6L87S1SDpO0hxJc5YsWdLjgSye+qcer7Oqk046qU0VD2tdphvGTr1sjfHTj7+y2+9phwU77Mh33vv2pvPbtn2ptg06qoe/A9Z+O5+9c6dD6B8XmiNiekRMiIgJI0e27LrDzMzWUSeTwv2s+SD40XmamZl1SCeTwgzgA/kupDcAj0fEgx2Mx8zsBa9tvaRKOg/YHxghaTHpgdYbAUTED4GZwCGkh40/CXywXbGYmVk1bUsKETG5xfwAPtKu5ZuZWff1iwvNZmbWO5wUzMys4KRgZmYFJwUzMys4KZiZWcFJwczMCk4KZmZWcFIwM7OCk4KZmRWcFMzMrOCkYGZmBScFMzMrOCmYmVnBScHMzApOCmZmVnBSMDOzgpOCmZkVnBTMzKzgpGBmZgUnBTMzKzgpmJlZwUnBzMwKTgpmZlZwUjAzs4KTgpmZFZwUzMys4KRgZmYFJwUzMys4KZiZWcFJwczMCk4KZmZWcFIwM7NCW5OCpImS7pC0UNLUBvNfIekqSTdLulXSIe2Mx8zMuta2pCBpIHA6cDAwHpgsaXxdsS8CF0bEa4FJwA/aFY+ZmbXWzjOFPYCFEXF3RKwEzgcOqysTwKZ5eBjwQBvjMTOzFtqZFEYBi0rji/O0spOAIyUtBmYCH21UkaTjJM2RNGfJkiXtiNXMzOj8hebJwFkRMRo4BDhX0loxRcT0iJgQERNGjhzZ60Gamb1QtDMp3A+MKY2PztPKjgYuBIiIa4HBwIg2xmRmZl1oZ1KYDYyTtI2kjUkXkmfUlfk7cACApB1JScHtQ2ZmHdK2pBARq4ATgMuBBaS7jOZLmibp0FzsU8CxkuYC5wFTIiLaFZOZmXVtUDsrj4iZpAvI5WknloZvA/ZuZwxmZlZdpy80m5lZH+KkYGZmBScFMzMrOCmYmVnBScHMzApOCmZmVnBSMDOzgpOCmZkVnBTMzKzgpGBmZgUnBTMzKzgpmJlZwUnBzMwKTgpmZlZwUjAzs4KTgpmZFZwUzMys4KRgZmYFJwUzMys4KZiZWcFJwczMCk4KZmZWcFIwM7OCk4KZmRWcFMzMrOCkYGZmBScFMzMrVEoKknZudyBmZtZ5Vc8UfiDpBkkfljSsrRGZmVnHVEoKEbEP8D5gDHCjpJ9LektbIzMzs15X+ZpCRNwJfBH4LLAfcJqk2yX9S7uCMzOz3lX1msIukk4FFgBvBt4RETvm4VPbGJ+ZmfWiqmcK/w3cBOwaER+JiJsAIuIB0tlDQ5ImSrpD0kJJU5uUOULSbZLmS/p5d1fAzMx6zqCK5d4GPBURzwFIGgAMjognI+LcRm+QNBA4HXgLsBiYLWlGRNxWKjMO+Bywd0Qsk/Sy9VgXMzNbT1XPFK4ANimNvzhP68oewMKIuDsiVgLnA4fVlTkWOD0ilgFExCMV4zEzszaomhQGR8SK2kgefnGL94wCFpXGF+dpZa8CXiXpL5KukzSxYjxmZtYGVZPCPyXtVhuR9DrgqR5Y/iBgHLA/MBn4saTh9YUkHSdpjqQ5S5Ys6YHFmplZI1WvKXwcuEjSA4CALYH3tnjP/aTfNdSMztPKFgPXR8SzwD2S/kZKErPLhSJiOjAdYMKECVExZjMz66ZKSSEiZkvaAdg+T7oj78i7MhsYJ2kbUjKYBPxrXZlfks4Q/lfSCFJz0t1Vgzczs55V9UwBYHdgbH7PbpKIiHOaFY6IVZJOAC4HBgJnRsR8SdOAORExI887SNJtwHPApyPi0XVcFzMzW0+VkoKkc4FXAreQdt4AATRNCgARMROYWTft
xNJwAJ/MLzMz67CqZwoTgPF5J25mZhuoqncf/ZV0cdnMzDZgVc8URgC3SboBeKY2MSIObUtUZmbWEVWTwkntDMLMzPqGqrekzpK0NTAuIq6Q9GLSHUVmZrYBqdp19rHAxcCP8qRRpN8YmJnZBqTqheaPAHsDy6F44I57NDUz28BUTQrP5J5OAZA0iPQ7BTMz24BUTQqzJH0e2CQ/m/ki4NftC8vMzDqhalKYCiwB5gH/TvqVctMnrpmZWf9U9e6j1cCP88vMzDZQVfs+uocG1xAiYtsej8jMzDqmO30f1QwG3gO8tOfDMTOzTqp0TSEiHi297o+I7wJva3NsZmbWy6o2H+1WGh1AOnPozrMYzMysH6i6Y/9OaXgVcC9wRI9HY2ZmHVX17qM3tTsQMzPrvKrNR10+GS0iTumZcMzMrJO6c/fR7sCMPP4O4AbgznYEZWZmnVE1KYwGdouIJwAknQRcFhFHtiswMzPrfVW7udgCWFkaX5mnmZnZBqTqmcI5wA2SLs3j7wTObk9IZmbWKVXvPvqapN8A++RJH4yIm9sXlpmZdULV5iOAFwPLI+J7wGJJ27QpJjMz65Cqj+P8EvBZ4HN50kbAT9sVlJmZdUbVM4V3AYcC/wSIiAeAoe0KyszMOqNqUlgZEUHuPlvSS9oXkpmZdUrVpHChpB8BwyUdC1yBH7hjZrbBqXr30X/lZzMvB7YHToyI37c1MjMz63Utk4KkgcAVuVM8JwIzsw1Yy+ajiHgOWC1pWC/EY2ZmHVT1F80rgHmSfk++AwkgIv6jLVGZmVlHVE0Kv8gvMzPbgHWZFCS9IiL+HhHr1M+RpInA94CBwBkR8c0m5d4NXAzsHhFz1mVZZma2/lpdU/hlbUDSJd2pOF+gPh04GBgPTJY0vkG5ocDHgOu7U7+ZmfW8VklBpeFtu1n3HsDCiLg7IlYC5wOHNSj3FeBbwNPdrN/MzHpYq6QQTYarGAUsKo0vztMKknYDxkTEZV1VJOk4SXMkzVmyZEk3wzAzs6paXWjeVdJy0hnDJnmYPB4Rsem6LljSAOAUYEqrshExHZgOMGHChO4mJzMzq6jLpBARA9ej7vuBMaXx0XlazVDg1cDVkgC2BGZIOtQXm83MOqM7z1PortnAOEnbSNoYmATMqM2MiMcjYkREjI2IscB1gBOCmVkHtS0pRMQq4ATgcmABcGFEzJc0TdKh7VqumZmtu6o/XlsnETETmFk37cQmZfdvZyxmZtZaO5uPzMysn3FSMDOzgpOCmZkVnBTMzKzgpGBmZgUnBTMzKzgpmJlZwUnBzMwKTgpmZlZwUjAzs4KTgpmZFZwUzMys4KRgZmYFJwUzMys4KZiZWcFJwczMCk4KZmZWcFIwM7OCk4KZmRWcFMzMrOCkYGZmBScFMzMrOCmYmVnBScHMzApOCmZmVnBSMDOzgpOCmZkVnBTMzKzgpGBmZgUnBTMzKzgpmJlZwUnBzMwKbU0KkiZKukPSQklTG8z/pKTbJN0q6Q+Stm5nPGZm1rW2JQVJA4HTgYOB8cBkSePrit0MTIiIXYCLgW+3Kx4zM2utnWcKewALI+LuiFgJnA8cVi4QEVdFxJN59DpgdBvjMTOzFtqZFEYBi0rji/O0Zo4GftNohqTjJM2RNGfJkiU9GKKZmZX1iQvNko4EJgAnN5ofEdMjYkJETBg5cmTvBmdm9gIyqI113w+MKY2PztPWIOlA4AvAfhHxTBvjMTOzFtp5pjAbGCdpG0kbA5OAGeUCkl4L/Ag4NCIeaWMsZmZWQduSQkSsAk4ALgcWABdGxHxJ0yQdmoudDAwBLpJ0i6QZTaozM7Ne0M7mIyJiJjCzbtqJpeED27l8MzPrnj5xodnMzPoGJwUzMys4KZiZWcFJwczMCk4KZmZWcFIwM7OCk4KZmRWcFMzMrOCkYGZmBScFMzMrOCmYmVnBScHMzApOCmZmVnBSMDOzgpOCmZkVnBTMzKzgpGBmZgUnBTMzKzgpmJlZ
wUnBzMwKTgpmZlZwUjAzs4KTgpmZFZwUzMys4KRgZmYFJwUzMys4KZiZWcFJwczMCk4KZmZWcFIwM7OCk4KZmRWcFMzMrOCkYGZmhbYmBUkTJd0haaGkqQ3mv0jSBXn+9ZLGtjMeMzPrWtuSgqSBwOnAwcB4YLKk8XXFjgaWRcR2wKnAt9oVj5mZtdbOM4U9gIURcXdErATOBw6rK3MYcHYevhg4QJLaGJOZmXVBEdGeiqXDgYkRcUwefz/w+og4oVTmr7nM4jx+Vy6ztK6u44Dj8uj2wB1NFjsCWNpkXl/WX+OG/hu74+5djrt3NYp764gY2eqNg9oTT8+KiOnA9FblJM2JiAm9EFKP6q9xQ/+N3XH3Lsfdu9Yn7nY2H90PjCmNj87TGpaRNAgYBjzaxpjMzKwL7UwKs4FxkraRtDEwCZhRV2YGcFQePhy4MtrVnmVmZi21rfkoIlZJOgG4HBgInBkR8yVNA+ZExAzgJ8C5khYCj5ESx/po2cTUR/XXuKH/xu64e5fj7l3rHHfbLjSbmVn/4180m5lZwUnBzMwK/TopSHqPpPmSVktqevuVpHslzZN0i6Q5vRljk3iqxt1lNyGdIOmlkn4v6c78d7Mm5Z7L2/sWSfU3GPSK/trNSoW4p0haUtq+x3QiznqSzpT0SP79UaP5knRaXq9bJe3W2zE2UiHu/SU9XtreJ/Z2jI1IGiPpKkm35f3JxxqU6f42j4h++wJ2JP2Y7WpgQhfl7gVGdDre7sRNujh/F7AtsDEwFxjfB2L/NjA1D08FvtWk3IoOx9ly+wEfBn6YhycBF/SB7Vsl7inA9zsda4PY9wV2A/7aZP4hwG8AAW8Aru90zBXj3h/4v07H2SCulwO75eGhwN8afFe6vc379ZlCRCyIiGa/bu6zKsZdpZuQTih3TXI28M4OxtKV/trNSl/93FuKiD+S7iJs5jDgnEiuA4ZLennvRNdchbj7pIh4MCJuysNPAAuAUXXFur3N+3VS6IYAfifpxtxlRn8wClhUGl/M2h94J2wREQ/m4YeALZqUGyxpjqTrJHUicVTZfkWZiFgFPA5s3ivRNVf1c393bg64WNKYBvP7or76na5iT0lzJf1G0k6dDqZebvp8LXB93axub/M+382FpCuALRvM+kJE/KpiNW+MiPslvQz4vaTb89FB2/RQ3B3RVezlkYgISc3uad46b/NtgSslzYuIu3o61heoXwPnRcQzkv6ddLbz5g7HtCG7ifR9XiHpEOCXwLgOx1SQNAS4BPh4RCxf3/r6fFKIiAN7oI77899HJF1KOkVva1LogbirdBPSFl3FLulhSS+PiAfzaegjTeqobfO7JV1NOorpzaTQnW5WFvehblZaxh0R5RjPIF3n6Q869p1eH+UdbUTMlPQDSSOiruPOTpC0ESkh/CwiftGgSLe3+QbffCTpJZKG1oaBg4CGdxn0MVW6CemEctckRwFrnfVI2kzSi/LwCGBv4LZeizDpr92stIy7rk34UFJbcn8wA/hAviPmDcDjpabIPkvSlrVrTZL2IO03O33wQI7pJ8CCiDilSbHub/NOX0Ffz6vv7yK1kT0DPAxcnqdvBczMw9uS7uCYC8wnNd/0+bjj+TsH/kY6wu543DmmzYE/AHcCVwAvzdMnAGfk4b2AeXmbzwOO7lCsa20/YBpwaB4eDFwELARuALbt9PatGPc38nd5LnAVsEOnY85xnQc8CDybv99HA8cDx+f5Ij146678vWh6x2Afi/uE0va+Dtir0zHnuN5Iul56K3BLfh2yvtvc3VyYmVlhg28+MjOz6pwUzMys4KRgZmYFJwUzMys4KZiZWcFJwfo1SadK+nhp/HJJZ5TGvyPpky3quKbCcu7Nv7mon76/pL26G3eTZUyR9P2eqMtsXTkpWH/3F9LvIpA0ABgBlPum2QvocqcfEeuzU9+/tnyzDYGTgvV31wB75uGdSL9Wf6L0q+odSX3XIOnTkmbnjuS+XKtA0or8
d0DuwuB2pWdFzJR0eGlZH5V0k9KzOXbInZAdD3wi97O/T6nOAfnsYnhp2p2StpD0DqXnN9ws6QpJa3UqKOms8rJrMTZbj/zL/ctyp21/lfTedd6i9oLmpGD9WkQ8AKyS9ArSEfu1pJ4i9yT9ynpeRKyUdBCpE7M9gNcAr5O0b111/wKMBcYD7+f5ZFOzNCJ2A/4H+H8RcS/wQ+DUiHhNRPypFNdqUhcg7wKQ9Hrgvoh4GPgz8IaIeC2pa+zPVF3fLtZjIvBAROwaEa8Gflu1TrMyJwXbEFxDSgi1pHBtafwvucxB+XUz6cxhB9bu6fKNwEURsToiHiJ1IVFW63DsRlLyaOUCoHbEPimPQ+qU7HJJ84BPs2ZzVyvN1mMe8BZJ35K0T0Q83o06zQp9vpdUswpq1xV2JjUfLQI+BSwH/jeXEfCNiPjReiznmfz3Oar971wLbCdpJOlhRF/N0/8bOCUiZkjaHzipwXtXkQ/a8rWSjfP0puuh9KjFQ4CvSvpDREyrslJmZT5TsA3BNcDbgcci4rmIeAwYTmr+qV1kvhz4t9z3PJJG5edrlP2F9PCaAbmdf/8Ky36C9CjEtUTqWOxS4BRST5a1njWH8Xz3xUc1ei/pEbKvy8OHAht1tR6StgKejIifAieTHi9p1m0+U7ANwTzSXUc/r5s2JHKf9xHxO0k7AtfmXpBXAEey5vMgLgEOIHXzvYjUPNOqGebXwMWSDgM+Wr6ukF1A6g57SmnaScBFkpYBVwLbNKj3x8CvJM0lXR/4Z4v12A44WdJqUm+fH2oRt1lD7iXVrETSkEhP2Nqc1J323vn6gtkLgs8UzNb0f/k20o2Brzgh2AuNzxTMzKzgC81mZlZwUjAzs4KTgpmZFZwUzMys4KRgZmaF/w+kjehhgv6fWQAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Draw one histogram per (penalty, C) pair.\n",
+ "# NOTE(review): assumes coefs[p_i][c_i] holds the fitted weights for penalties[p_i]\n",
+ "# and c_values[c_i], as built by an earlier cell - confirm.\n",
+ "num_C = 5\n",
+ "num_penalty = 2\n",
+ "for p_i in range(num_penalty):\n",
+ "    for c_i in range(num_C):\n",
+ "        plt.hist(coefs[p_i][c_i], bins=30)\n",
+ "        plt.title('Logistic Regression with %s penalty, C = %.2f' %(penalties[p_i], c_values[c_i]))\n",
+ "        plt.xlabel('Weight values')\n",
+ "        plt.ylabel('Frequency')\n",
+ "        # Show inside the loop so every combination gets its own figure.\n",
+ "        plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can think of the \"C\" parameter as a way of specifying the tradeoff between the L1/L2 penalty and the negative log likelihood of the model. \n",
+ "\n",
+ "A small value of \"C\" means that we are assigning greater weight to the L1/L2 penalty (equivalently, downweighting the negative log likelihood).\n",
+ "\n",
+ "L1 regularization induces sparse models - this can be very useful if you suspect your classification target variable can be explained by only a few features.\n",
+ "\n",
+ "* Sparsity means that only a few entries in a matrix (or vector) are non-zero. The L1 norm tends to produce many coefficients that are zero or very small, with only a few large coefficients."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Support Vector Machines"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC\n",
+ "The SVM Classifier also takes quite a few parameters. For now we will use Linear SVMs. The model is called LinearSVC in sklearn.\n",
+ "\n",
+ "We will be playing with following parameters:\n",
+ "* C: same as above\n",
+ "\n",
+ "SVM tries to find the hyperplane that maximizes the \"margin\" between the two classes of points. The \"C\" parameter in [SVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC) has the same role as the \"C\" parameter in LogisticRegression: it tells you how much to penalize the \"size\" of the weight vector. Note that SVC only allows for L2 regularization.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Let's fit an SVM"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[-0.51611687 -0.29820437 -0.51272115 -0.51611687 -0.35975434 -0.58490291\n",
+ " -0.74248609 -0.58490291 -0.51611687 -0.57761275]\n",
+ "the average of the confidence score is: -0.56\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Train a linear SVM on the training split; fit() returns the fitted estimator.\n",
+ "svm = LinearSVC(tol=1e-5, random_state=0).fit(x_train, y_train)\n",
+ "\n",
+ "# decision_function gives one signed score per test row.\n",
+ "confidence_score = svm.decision_function(x_test)\n",
+ "print(confidence_score[:10])\n",
+ "print(\"the average of the confidence score is: %.2f\" % np.mean(confidence_score))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now predict scores on the test set and plot the distribution of scores\n",
+ "You might notice that the function you've been using to predict so far does not work. Is there another function you need to use? Which one? Why?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "The confidence score for a sample is the signed distance of that sample to the hyperplane.\n",
+ "\n",
+ "return:\n",
+ "array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)\n",
+ "\n",
+ "Confidence scores per (sample, class) combination. In the binary case, confidence score for self.classes_[1] where >0 means this class would be predicted."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### now we can select a threshold and calculate accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The true number of HBP is 541/2602 from the data, with percentage 20.79%\n",
+ "\n",
+ "(Threshold: -1), the total number of predicted HBP is 2598, the accuracy is 0.21\n",
+ "(Threshold: -0.9), the total number of predicted HBP is 2581, the accuracy is 0.22\n",
+ "(Threshold: -0.8), the total number of predicted HBP is 2460, the accuracy is 0.25\n",
+ "(Threshold: -0.7), the total number of predicted HBP is 2130, the accuracy is 0.34\n",
+ "(Threshold: -0.6), the total number of predicted HBP is 1638, the accuracy is 0.47\n",
+ "(Threshold: -0.5), the total number of predicted HBP is 631, the accuracy is 0.69\n",
+ "(Threshold: -0.4), the total number of predicted HBP is 393, the accuracy is 0.74\n",
+ "(Threshold: -0.3), the total number of predicted HBP is 291, the accuracy is 0.76\n",
+ "(Threshold: -0.2), the total number of predicted HBP is 26, the accuracy is 0.79\n",
+ "(Threshold: -0.1), the total number of predicted HBP is 0, the accuracy is 0.79\n",
+ "(Threshold: 0), the total number of predicted HBP is 0, the accuracy is 0.79\n"
+ ]
+ }
+ ],
+ "source": [
+ "# code\n",
+ "print(\"The true number of HBP is {}/{} from the data, with percentage {:.2f}%\\n\".format(\n",
+ " sum(y_test), len(y_test), 100.*sum(y_test)/len(y_test)))\n",
+ "# Sweep the cut-off and report the number of predicted positives and the accuracy at each value.\n",
+ "candidate_thresholds = [-1, -0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0]\n",
+ "for threshold in candidate_thresholds:\n",
+ "    # A score strictly above the cut-off is labelled 1, everything else 0.\n",
+ "    pred_label = [int(score > threshold) for score in confidence_score]\n",
+ "    n_predicted_hbp = sum(pred_label)\n",
+ "    threshold_accuracy = accuracy(pred_label, y_test)\n",
+ "    print(\"(Threshold: {}), the total number of predicted HBP is {}, the accuracy is {:.2f}\".format(\n",
+ "        threshold, n_predicted_hbp, threshold_accuracy))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "-0.4 is an ideal threshold."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Let's now vary values of C and see the results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(C: 0.01), the total number of predicted HBP is 113, the accuracy is 0.78\n",
+ "(C: 0.1), the total number of predicted HBP is 113, the accuracy is 0.78\n",
+ "(C: 1), the total number of predicted HBP is 113, the accuracy is 0.78\n",
+ "(C: 10), the total number of predicted HBP is 113, the accuracy is 0.78\n",
+ "(C: 100), the total number of predicted HBP is 113, the accuracy is 0.78\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Refit the linear SVM for each value of C and evaluate it at a fixed threshold.\n",
+ "threshold = -0.4\n",
+ "c_values = [10**-2, 10**-1, 1 , 10, 10**2]\n",
+ "for c_value in c_values:\n",
+ "    svm = LinearSVC(random_state=0, tol=1e-5, C=c_value).fit(x_train, y_train)\n",
+ "    # Score with the SVM fitted in this iteration so the results actually depend on C.\n",
+ "    confidence_score = svm.decision_function(x_test)\n",
+ "    pred_label = [1 if x > threshold else 0 for x in confidence_score]\n",
+ "    print(\"(C: {}), the total number of predicted HBP is {}, the accuracy is {:.2f}\".format(\n",
+ "        c_value, sum(pred_label), accuracy(pred_label, y_test)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Varying the value of C doesn't have much effect on accuracy for this dataset under the current setting of the model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluation Metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We covered several evaluation metrics in class:\n",
+ " - accuracy\n",
+ " - precision\n",
+ " - recall\n",
+ " - area under curve\n",
+ " - ROC curves\n",
+ " \n",
+ "Although sklearn has built-in functions to calculate these metrics,\n",
+ "in this lab we want to give you an understanding of these metrics \n",
+ "by writing functions to calculate them yourself.\n",
+ "\n",
+ "Remember that accuracy, precision, and recall are calculated at a specific threshold for turning scores into 0 and 1.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Set Threshold\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "threshold = -0.4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### We will first create a confusion matrix based on this threshold"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Fit the linear SVM and build its confusion matrix at the chosen threshold.\n",
+ "svm = LinearSVC(random_state=0, tol=1e-5).fit(x_train, y_train)\n",
+ "# Score the test set with the SVM that was just fitted.\n",
+ "confidence_score = svm.decision_function(x_test)\n",
+ "pred_label = [1 if x > threshold else 0 for x in confidence_score]\n",
+ "# Rows are true classes, columns are predicted classes.\n",
+ "c = confusion_matrix(y_test, pred_label)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "I create a confusion matrix for our SVM classifier."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1987, 74, 502, 39)"
+ ]
+ },
+ "execution_count": 141,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "true_negatives, false_positive, false_negatives, true_positives = c.ravel()\n",
+ "(true_negatives, false_positive, false_negatives, true_positives)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Let's now write functions that can calculate each metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def calculate_accuracy_at_threshold(predicted_scores, true_labels, threshold):\n",
+ "    \"\"\"Return the accuracy obtained when scores above `threshold` are labelled 1.\n",
+ "\n",
+ "    predicted_scores: iterable of numeric scores, one per sample.\n",
+ "    true_labels: true 0/1 labels, in the same order as the scores.\n",
+ "    threshold: a score strictly greater than this value is predicted as 1.\n",
+ "    \"\"\"\n",
+ "    pred_label = [1 if x > threshold else 0 for x in predicted_scores]\n",
+ "    # Compare against the labels passed in, not a notebook-level variable.\n",
+ "    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present.\n",
+ "    true_negatives, false_positive, false_negatives, true_positives = confusion_matrix(\n",
+ "        true_labels, pred_label, labels=[0, 1]).ravel()\n",
+ "    return 1.0 * (true_positives + true_negatives) / (true_negatives + false_positive + false_negatives + true_positives)\n",
+ "\n",
+ "def calculate_precision_at_threshold(predicted_scores, true_labels, threshold):\n",
+ "    \"\"\"Return precision, TP / (TP + FP), when scores above `threshold` are labelled 1.\n",
+ "\n",
+ "    predicted_scores: iterable of numeric scores, one per sample.\n",
+ "    true_labels: true 0/1 labels, in the same order as the scores.\n",
+ "    threshold: a score strictly greater than this value is predicted as 1.\n",
+ "    Returns 0.0 when nothing is predicted positive at this threshold.\n",
+ "    \"\"\"\n",
+ "    pred_label = [1 if x > threshold else 0 for x in predicted_scores]\n",
+ "    # Compare against the labels passed in, not a notebook-level variable.\n",
+ "    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present.\n",
+ "    _, false_positive, _, true_positives = confusion_matrix(\n",
+ "        true_labels, pred_label, labels=[0, 1]).ravel()\n",
+ "    predicted_positives = false_positive + true_positives\n",
+ "    if predicted_positives == 0:\n",
+ "        # Precision is undefined with no positive predictions; report 0.0 instead of nan.\n",
+ "        return 0.0\n",
+ "    return 1.0 * true_positives / predicted_positives\n",
+ "\n",
+ "def calculate_recall_at_threshold(predicted_scores, true_labels, threshold):\n",
+ "    \"\"\"Return recall, TP / (TP + FN), when scores above `threshold` are labelled 1.\n",
+ "\n",
+ "    predicted_scores: iterable of numeric scores, one per sample.\n",
+ "    true_labels: true 0/1 labels, in the same order as the scores.\n",
+ "    threshold: a score strictly greater than this value is predicted as 1.\n",
+ "    Returns 0.0 when `true_labels` contains no positive samples.\n",
+ "    \"\"\"\n",
+ "    pred_label = [1 if x > threshold else 0 for x in predicted_scores]\n",
+ "    # Compare against the labels passed in, not a notebook-level variable.\n",
+ "    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present.\n",
+ "    _, _, false_negatives, true_positives = confusion_matrix(\n",
+ "        true_labels, pred_label, labels=[0, 1]).ravel()\n",
+ "    actual_positives = false_negatives + true_positives\n",
+ "    if actual_positives == 0:\n",
+ "        # Recall is undefined with no true positives in the data; report 0.0 instead of nan.\n",
+ "        return 0.0\n",
+ "    return 1.0 * true_positives / actual_positives"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Accuracy = (TP + TN) / (TP + FP + FN + TN)\n",
+ "\n",
+ "Precision = TP / (TP + FP)\n",
+ "\n",
+ "Recall = TP / (TP + FN)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now let's calculate all of these for a logistic regression model you built above"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The accuracy is 0.75\n",
+ "The precision is 0.34\n",
+ "The recall is 0.23\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fit logistic regression, then evaluate its predicted probabilities at a 0.3 cut-off.\n",
+ "threshold = 0.3\n",
+ "lrf = LogisticRegression(random_state=0, solver='liblinear')\n",
+ "lrf.fit(x_train, y_train)\n",
+ "predicted_scores = lrf.predict_proba(x_test)\n",
+ "\n",
+ "# predict_proba returns one column per class; column 1 is used as the score.\n",
+ "positive_class_scores = predicted_scores[:, 1]\n",
+ "lr_accuracy = calculate_accuracy_at_threshold(positive_class_scores, y_test, threshold)\n",
+ "lr_precision = calculate_precision_at_threshold(positive_class_scores, y_test, threshold)\n",
+ "lr_recall = calculate_recall_at_threshold(positive_class_scores, y_test, threshold)\n",
+ "\n",
+ "print(\"The accuracy is %.2f\" % lr_accuracy)\n",
+ "print(\"The precision is %.2f\" % lr_precision)\n",
+ "print(\"The recall is %.2f\" % lr_recall)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now let's write a function that generates the precision, recall, k (% of population) graph that we covered in class\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def plot_precision_recall_k(predicted_scores, true_labels):\n",
+ "    \"\"\"Plot precision and recall against k, the fraction of the population predicted positive.\n",
+ "\n",
+ "    predicted_scores: one numeric score per sample (higher means more likely positive).\n",
+ "    true_labels: true binary labels, in the same order as the scores.\n",
+ "    Shows the figure and returns None.\n",
+ "    \"\"\"\n",
+ "    scores = np.asarray(predicted_scores)\n",
+ "    precision, recall, thresholds = precision_recall_curve(true_labels, scores)\n",
+ "    # precision_recall_curve returns one more precision/recall value than thresholds\n",
+ "    # (a final precision=1, recall=0 point); drop it so the three arrays line up.\n",
+ "    precision, recall = precision[:-1], recall[:-1]\n",
+ "\n",
+ "    # k for a threshold is the share of samples scored at or above that threshold.\n",
+ "    sorted_scores = np.sort(scores)\n",
+ "    n_below = np.searchsorted(sorted_scores, thresholds, side='left')\n",
+ "    pct_population = 1.0 - n_below / float(len(scores))\n",
+ "\n",
+ "    fig, ax_precision = plt.subplots()\n",
+ "    ax_precision.plot(pct_population, precision, color='b')\n",
+ "    ax_precision.set_xlabel('k (fraction of population predicted positive)')\n",
+ "    ax_precision.set_ylabel('Precision', color='b')\n",
+ "    ax_precision.set_ylim(0, 1.05)\n",
+ "\n",
+ "    # Recall shares the x axis but gets its own y axis on the right.\n",
+ "    ax_recall = ax_precision.twinx()\n",
+ "    ax_recall.plot(pct_population, recall, color='r')\n",
+ "    ax_recall.set_ylabel('Recall', color='r')\n",
+ "    ax_recall.set_ylim(0, 1.05)\n",
+ "\n",
+ "    ax_precision.set_title('Precision and recall at k')\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### let's plot it for the same logistic regression model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plot_precision_recall_k(predicted_scores[:,1], y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we build the same graph for an svm model and compare the two. Which one is better?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Fit the linear SVM and plot the graph from its own scores so it can be compared\n",
+ "# with the logistic regression graph.\n",
+ "svm = LinearSVC(random_state=0, tol=1e-5).fit(x_train, y_train)\n",
+ "confidence_score = svm.decision_function(x_test)\n",
+ "# decision_function returns a 1-D array of signed scores for a binary problem.\n",
+ "plot_precision_recall_k(confidence_score, y_test)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}