diff --git a/01_materials/notebooks/ClassW_Jan_7_Classfication-1.ipynb b/01_materials/notebooks/ClassW_Jan_7_Classfication-1.ipynb new file mode 100644 index 000000000..cabdaceff --- /dev/null +++ b/01_materials/notebooks/ClassW_Jan_7_Classfication-1.ipynb @@ -0,0 +1,8605 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# load in libraries\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from mpl_toolkits import mplot3d" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302M17.9910.38122.801001.00.118400.277600.300100.14710...25.38017.33184.602019.00.162200.665600.71190.26540.46010.11890
1842517M20.5717.77132.901326.00.084740.078640.086900.07017...24.99023.41158.801956.00.123800.186600.24160.18600.27500.08902
284300903M19.6921.25130.001203.00.109600.159900.197400.12790...23.57025.53152.501709.00.144400.424500.45040.24300.36130.08758
384348301M11.4220.3877.58386.10.142500.283900.241400.10520...14.91026.5098.87567.70.209800.866300.68690.25750.66380.17300
484358402M20.2914.34135.101297.00.100300.132800.198000.10430...22.54016.67152.201575.00.137400.205000.40000.16250.23640.07678
..................................................................
564926424M21.5622.39142.001479.00.111000.115900.243900.13890...25.45026.40166.102027.00.141000.211300.41070.22160.20600.07115
565926682M20.1328.25131.201261.00.097800.103400.144000.09791...23.69038.25155.001731.00.116600.192200.32150.16280.25720.06637
566926954M16.6028.08108.30858.10.084550.102300.092510.05302...18.98034.12126.701124.00.113900.309400.34030.14180.22180.07820
567927241M20.6029.33140.101265.00.117800.277000.351400.15200...25.74039.42184.601821.00.165000.868100.93870.26500.40870.12400
56892751B7.7624.5447.92181.00.052630.043620.000000.00000...9.45630.3759.16268.60.089960.064440.00000.00000.28710.07039
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 \n", + ".. ... ... ... ... ... ... \n", + "564 926424 M 21.56 22.39 142.00 1479.0 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 \n", + "566 926954 M 16.60 28.08 108.30 858.1 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 \n", + "568 92751 B 7.76 24.54 47.92 181.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.30010 0.14710 \n", + "1 0.08474 0.07864 0.08690 0.07017 \n", + "2 0.10960 0.15990 0.19740 0.12790 \n", + "3 0.14250 0.28390 0.24140 0.10520 \n", + "4 0.10030 0.13280 0.19800 0.10430 \n", + ".. ... ... ... ... \n", + "564 0.11100 0.11590 0.24390 0.13890 \n", + "565 0.09780 0.10340 0.14400 0.09791 \n", + "566 0.08455 0.10230 0.09251 0.05302 \n", + "567 0.11780 0.27700 0.35140 0.15200 \n", + "568 0.05263 0.04362 0.00000 0.00000 \n", + "\n", + " ... radius_worst texture_worst perimeter_worst area_worst \\\n", + "0 ... 25.380 17.33 184.60 2019.0 \n", + "1 ... 24.990 23.41 158.80 1956.0 \n", + "2 ... 23.570 25.53 152.50 1709.0 \n", + "3 ... 14.910 26.50 98.87 567.7 \n", + "4 ... 22.540 16.67 152.20 1575.0 \n", + ".. ... ... ... ... ... \n", + "564 ... 25.450 26.40 166.10 2027.0 \n", + "565 ... 23.690 38.25 155.00 1731.0 \n", + "566 ... 18.980 34.12 126.70 1124.0 \n", + "567 ... 25.740 39.42 184.60 1821.0 \n", + "568 ... 9.456 30.37 59.16 268.6 \n", + "\n", + " smoothness_worst compactness_worst concavity_worst \\\n", + "0 0.16220 0.66560 0.7119 \n", + "1 0.12380 0.18660 0.2416 \n", + "2 0.14440 0.42450 0.4504 \n", + "3 0.20980 0.86630 0.6869 \n", + "4 0.13740 0.20500 0.4000 \n", + ".. ... ... ... 
\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# loading the data , make sure you know where you are. \n", + "cancer = pd.read_csv('dataset/wdbc.csv')\n", + "cancer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 569 entries, 0 to 568\n", + "Data columns (total 32 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 569 non-null int64 \n", + " 1 diagnosis 569 non-null object \n", + " 2 radius_mean 569 non-null float64\n", + " 3 texture_mean 569 non-null float64\n", + " 4 perimeter_mean 569 non-null float64\n", + " 5 area_mean 569 non-null float64\n", + " 6 smoothness_mean 569 non-null float64\n", + " 7 compactness_mean 569 non-null float64\n", + " 8 concavity_mean 569 non-null float64\n", + " 9 concave points_mean 569 non-null float64\n", + " 10 symmetry_mean 569 non-null float64\n", + " 11 fractal_dimension_mean 569 non-null float64\n", + " 12 radius_se 569 non-null float64\n", + " 13 texture_se 569 non-null float64\n", + " 14 perimeter_se 569 non-null float64\n", + " 15 area_se 569 non-null float64\n", + " 16 smoothness_se 569 non-null float64\n", + " 17 compactness_se 569 non-null 
float64\n", + " 18 concavity_se 569 non-null float64\n", + " 19 concave points_se 569 non-null float64\n", + " 20 symmetry_se 569 non-null float64\n", + " 21 fractal_dimension_se 569 non-null float64\n", + " 22 radius_worst 569 non-null float64\n", + " 23 texture_worst 569 non-null float64\n", + " 24 perimeter_worst 569 non-null float64\n", + " 25 area_worst 569 non-null float64\n", + " 26 smoothness_worst 569 non-null float64\n", + " 27 compactness_worst 569 non-null float64\n", + " 28 concavity_worst 569 non-null float64\n", + " 29 concave points_worst 569 non-null float64\n", + " 30 symmetry_worst 569 non-null float64\n", + " 31 fractal_dimension_worst 569 non-null float64\n", + "dtypes: float64(30), int64(1), object(1)\n", + "memory usage: 142.4+ KB\n" + ] + } + ], + "source": [ + "# take a close look at the data: number of columns and rows, data types, non-null counts\n", + "cancer.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['M', 'B'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# inspect the unique categories in a column\n", + "cancer[\"diagnosis\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Malignant', 'Benign'], dtype=object)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rename the labels using a dictionary because they are object (string) dtype\n", + "cancer[\"diagnosis\"] = cancer['diagnosis'].replace({\n", + " \"M\": \"Malignant\", \n", + " \"B\": \"Benign\"\n", + " })\n", + "cancer[\"diagnosis\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "diagnosis\n", + "Benign 357\n", + "Malignant 212\n", + "Name: count, dtype: int64" + ] + 
}, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count the observations in each diagnosis class (Benign vs. Malignant)\n", + "cancer['diagnosis'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "diagnosis\n", + "Benign 0.627417\n", + "Malignant 0.372583\n", + "Name: proportion, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# proportion of each class (normalize=True returns fractions, not percentages)\n", + "cancer['diagnosis'].value_counts(normalize = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "diagnosis\n", + "Benign 357\n", + "Malignant 212\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# same counts, computed with groupby on the diagnosis column\n", + "cancer.groupby(\"diagnosis\").size()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "diagnosis\n", + "Benign 62.741652\n", + "Malignant 37.258348\n", + "dtype: float64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# percentage per class: group size divided by the total number of rows\n", + "(cancer.groupby(\"diagnosis\").size() / cancer.shape[0])*100" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# use matplotlib\n", + "# Create mapping between values and colors\n", + "labels = cancer[\"diagnosis\"].unique().tolist()\n", + "colors = list(mcolors.TABLEAU_COLORS.keys())\n", + "color_map = {l: colors[i % len(colors)] for i, l in enumerate(labels)}\n", + "\n", + "# Plot\n", + "plt.scatter(cancer[\"perimeter_mean\"], cancer['concavity_mean'], \n", + " color=cancer[\"diagnosis\"].map(color_map))\n", + "\n", + "# Create custom legend handles\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Perimeter Mean')\n", + "plt.ylabel('Concavity Mean')\n", + "plt.title('Scatter Plot of Perimeter Mean vs Concavity Mean')\n", + "plt.legend(handles=handles, title='Diagnosis')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot existing data\n", + "plt.scatter(cancer[\"perimeter_mean\"], cancer['concavity_mean'], \n", + " color=cancer[\"diagnosis\"].map(color_map))\n", + "\n", + "# Create custom legend handles\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add new observation\n", + "new_observation = {'perimeter_mean': 97, 'concavity_mean': 0.20}\n", + "plt.scatter(new_observation['perimeter_mean'], new_observation['concavity_mean'],\n", + " color='red', edgecolor='black', s=100, label='New Observation')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Perimeter Mean')\n", + "plt.ylabel('Concavity Mean')\n", + "plt.title('Scatter Plot of Perimeter Mean vs Concavity Mean')\n", + "plt.legend(handles=handles + [plt.Line2D([0], [0], marker='o', color='w', \n", + " markerfacecolor='red', markeredgecolor='black', \n", + " markersize=10, label='New Observation')], \n", + " title='Diagnosis')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# usually we would standardize the data first, but we skip that here\n", + "# new observation\n", + "new_obs_Perimeter = 97\n", + "new_obs_Concavity = 0.20" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# Euclidean distance from every row to the new observation (unscaled, so perimeter dominates)\n", + "\n", + "cancer['dist_from_new'] =(\n", + "\n", + "(cancer ['perimeter_mean'] - new_obs_Perimeter)**2 +\n", + "(cancer['concavity_mean'] - new_obs_Concavity)**2 \n", + ") ** (1/2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 25.800194\n", + "1 35.900178\n", + "2 33.000000\n", + "3 19.420044\n", + "4 38.100000\n", + " ... 
\n", + "564 45.000021\n", + "565 34.200046\n", + "566 11.300511\n", + "567 43.100266\n", + "568 49.080407\n", + "Name: dist_from_new, Length: 569, dtype: float64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer ['dist_from_new']" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perimeter_meanconcavity_meandiagnosisdist_from_new
29197.030.05940Benign0.143765
13896.850.15390Malignant0.156924
1596.730.16390Malignant0.272403
51497.260.07486Malignant0.288548
5497.260.05253Malignant0.298910
\n", + "
" + ], + "text/plain": [ + " perimeter_mean concavity_mean diagnosis dist_from_new\n", + "291 97.03 0.05940 Benign 0.143765\n", + "138 96.85 0.15390 Malignant 0.156924\n", + "15 96.73 0.16390 Malignant 0.272403\n", + "514 97.26 0.07486 Malignant 0.288548\n", + "54 97.26 0.05253 Malignant 0.298910" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# take the least 5\n", + "nearest_5 = cancer.nsmallest(5, \"dist_from_new\")[[\n", + " \"perimeter_mean\",\n", + " \"concavity_mean\",\n", + " \"diagnosis\",\n", + " \"dist_from_new\"\n", + "]]\n", + "\n", + "nearest_5" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create mapping between values and colors\n", + "labels = cancer[\"diagnosis\"].unique().tolist()\n", + "colors = list(mcolors.TABLEAU_COLORS.keys())\n", + "color_map = {l: colors[i % len(colors)] for i, l in enumerate(labels)}\n", + "\n", + "# Create a 3D plot\n", + "ax = plt.axes(projection=\"3d\")\n", + "\n", + "# Plot data points with color corresponding to diagnosis\n", + "sc = ax.scatter3D(cancer['perimeter_mean'], cancer['concavity_mean'], cancer['symmetry_mean'], \n", + " c=cancer['diagnosis'].map(color_map), marker='o')\n", + "\n", + "# Add axis labels\n", + "ax.set_xlabel('Perimeter Mean')\n", + "ax.set_ylabel('Concavity Mean')\n", + "ax.set_zlabel('Symmetry Mean')\n", + "ax.set_title('3D Scatter Plot of Perimeter Mean, Concavity Mean, and Symmetry Mean')\n", + "\n", + "# Create custom legend handles\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add legend\n", + "plt.legend(handles=handles, title='Diagnosis')\n", + "\n", + "# Show plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create mapping between values and colors\n", + "labels = cancer[\"diagnosis\"].unique().tolist()\n", + "colors = list(mcolors.TABLEAU_COLORS.keys())\n", + "color_map = {l: colors[i % len(colors)] for i, l in enumerate(labels)}\n", + "\n", + "# Create a 3D plot\n", + "ax = plt.axes(projection=\"3d\")\n", + "\n", + "# Plot data points with color corresponding to diagnosis\n", + "sc = ax.scatter3D(cancer['perimeter_mean'], cancer['concavity_mean'], cancer['symmetry_mean'], \n", + " c=cancer['diagnosis'].map(color_map), marker='o')\n", + "\n", + "# Define the new observation\n", + "new_observation = {'perimeter_mean': 97, 'concavity_mean': 0.20, 'symmetry_mean': 0.22}\n", + "\n", + "# Plot the new observation\n", + "ax.scatter3D(new_observation['perimeter_mean'], new_observation['concavity_mean'], \n", + " new_observation['symmetry_mean'], color='red', edgecolor='black', \n", + " s=100, marker='o', label='New Observation')\n", + "\n", + "# Add axis labels\n", + "ax.set_xlabel('Perimeter Mean')\n", + "ax.set_ylabel('Concavity Mean')\n", + "ax.set_zlabel('Symmetry Mean')\n", + "ax.set_title('3D Scatter Plot of Perimeter Mean, Concavity Mean, and Symmetry Mean')\n", + "\n", + "# Create custom legend handles\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add custom legend for new observation\n", + "# color='w' hides the connecting line so only the red marker shows, matching the 2D plot's legend\n", + "handles.append(plt.Line2D([0], [0], marker='o', color='w', label='New Observation', \n", + " markerfacecolor='red', markersize=10, markeredgecolor='black'))\n", + "\n", + "# Add legend\n", + "plt.legend(handles=handles, title='Diagnosis')\n", + "\n", + "# Show plot (call plt.show(); a bare `plt` only echoes the module repr as cell output)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "new_obs_Perimeter = 97\n", + "new_obs_Concavity = 0.20\n", + "new_obs_Symmetry = 0.22" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 25.800203\n", + "1 35.900199\n", + "2 33.000003\n", + "3 19.420085\n", + "4 38.100020\n", + " ... \n", + "564 45.000046\n", + "565 34.200075\n", + "566 11.300676\n", + "567 43.100270\n", + "568 49.080446\n", + "Name: dist_from_new, Length: 569, dtype: float64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "##********** **Calculating the distance *******as KNN do ********\n", + "\n", + "cancer[\"dist_from_new\"] = (\n", + " (cancer[\"perimeter_mean\"] - new_obs_Perimeter) ** 2\n", + " + (cancer[\"concavity_mean\"] - new_obs_Concavity) ** 2\n", + " + (cancer[\"symmetry_mean\"] - new_obs_Symmetry) ** 2\n", + ")**(1/2)\n", + "\n", + "cancer[\"dist_from_new\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perimeter_meanconcavity_meansymmetry_meandiagnosisdist_from_new
29197.030.059400.1879Benign0.147305
13896.850.153900.1957Malignant0.158795
1596.730.163900.2303Malignant0.272597
51497.260.074860.1561Malignant0.295539
5497.260.052530.1616Malignant0.304562
\n", + "
" + ], + "text/plain": [ + " perimeter_mean concavity_mean symmetry_mean diagnosis dist_from_new\n", + "291 97.03 0.05940 0.1879 Benign 0.147305\n", + "138 96.85 0.15390 0.1957 Malignant 0.158795\n", + "15 96.73 0.16390 0.2303 Malignant 0.272597\n", + "514 97.26 0.07486 0.1561 Malignant 0.295539\n", + "54 97.26 0.05253 0.1616 Malignant 0.304562" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nearest_5 = cancer.nsmallest(5, \"dist_from_new\")[[\n", + " \"perimeter_mean\",\n", + " \"concavity_mean\",\n", + " \"symmetry_mean\",\n", + " \"diagnosis\",\n", + " \"dist_from_new\"]]\n", + "\n", + "nearest_5" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import set_config\n", + "set_config (transform_output= \"pandas\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisperimeter_meanconcavity_mean
0Malignant122.800.30010
1Malignant132.900.08690
2Malignant130.000.19740
3Malignant77.580.24140
4Malignant135.100.19800
............
564Malignant142.000.24390
565Malignant131.200.14400
566Malignant108.300.09251
567Malignant140.100.35140
568Benign47.920.00000
\n", + "

569 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " diagnosis perimeter_mean concavity_mean\n", + "0 Malignant 122.80 0.30010\n", + "1 Malignant 132.90 0.08690\n", + "2 Malignant 130.00 0.19740\n", + "3 Malignant 77.58 0.24140\n", + "4 Malignant 135.10 0.19800\n", + ".. ... ... ...\n", + "564 Malignant 142.00 0.24390\n", + "565 Malignant 131.20 0.14400\n", + "566 Malignant 108.30 0.09251\n", + "567 Malignant 140.10 0.35140\n", + "568 Benign 47.92 0.00000\n", + "\n", + "[569 rows x 3 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer_train = cancer[[\"diagnosis\", \"perimeter_mean\", \"concavity_mean\"]]\n", + "cancer_train" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "knn" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# define our predictor variables (X) and the target labels (y)\n", + "X = cancer_train[[\"perimeter_mean\", \"concavity_mean\"]]\n", + "# y must be 1-D (a Series): a one-column DataFrame makes knn.fit raise a DataConversionWarning\n", + "y = cancer_train[\"diagnosis\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fit the knn model on the predictors and labels\n", + "knn.fit(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Malignant'], dtype=object)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_obs = pd.DataFrame({\"perimeter_mean\": [97], \"concavity_mean\": [0.20]})\n", + "knn.predict(new_obs)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Malignant'], dtype=object)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# predict the diagnosis for the new observation\n", + "knn.predict(new_obs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Jan 8 ####" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# divide the data\n", + "# adjust parameters\n", + "# tuning\n", + "# evaluate on the test set" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302M17.9910.38122.801001.00.118400.277600.300100.14710...25.38017.33184.602019.00.162200.665600.71190.26540.46010.11890
1842517M20.5717.77132.901326.00.084740.078640.086900.07017...24.99023.41158.801956.00.123800.186600.24160.18600.27500.08902
284300903M19.6921.25130.001203.00.109600.159900.197400.12790...23.57025.53152.501709.00.144400.424500.45040.24300.36130.08758
384348301M11.4220.3877.58386.10.142500.283900.241400.10520...14.91026.5098.87567.70.209800.866300.68690.25750.66380.17300
484358402M20.2914.34135.101297.00.100300.132800.198000.10430...22.54016.67152.201575.00.137400.205000.40000.16250.23640.07678
..................................................................
564926424M21.5622.39142.001479.00.111000.115900.243900.13890...25.45026.40166.102027.00.141000.211300.41070.22160.20600.07115
565926682M20.1328.25131.201261.00.097800.103400.144000.09791...23.69038.25155.001731.00.116600.192200.32150.16280.25720.06637
566926954M16.6028.08108.30858.10.084550.102300.092510.05302...18.98034.12126.701124.00.113900.309400.34030.14180.22180.07820
567927241M20.6029.33140.101265.00.117800.277000.351400.15200...25.74039.42184.601821.00.165000.868100.93870.26500.40870.12400
56892751B7.7624.5447.92181.00.052630.043620.000000.00000...9.45630.3759.16268.60.089960.064440.00000.00000.28710.07039
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 \n", + ".. ... ... ... ... ... ... \n", + "564 926424 M 21.56 22.39 142.00 1479.0 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 \n", + "566 926954 M 16.60 28.08 108.30 858.1 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 \n", + "568 92751 B 7.76 24.54 47.92 181.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.30010 0.14710 \n", + "1 0.08474 0.07864 0.08690 0.07017 \n", + "2 0.10960 0.15990 0.19740 0.12790 \n", + "3 0.14250 0.28390 0.24140 0.10520 \n", + "4 0.10030 0.13280 0.19800 0.10430 \n", + ".. ... ... ... ... \n", + "564 0.11100 0.11590 0.24390 0.13890 \n", + "565 0.09780 0.10340 0.14400 0.09791 \n", + "566 0.08455 0.10230 0.09251 0.05302 \n", + "567 0.11780 0.27700 0.35140 0.15200 \n", + "568 0.05263 0.04362 0.00000 0.00000 \n", + "\n", + " ... radius_worst texture_worst perimeter_worst area_worst \\\n", + "0 ... 25.380 17.33 184.60 2019.0 \n", + "1 ... 24.990 23.41 158.80 1956.0 \n", + "2 ... 23.570 25.53 152.50 1709.0 \n", + "3 ... 14.910 26.50 98.87 567.7 \n", + "4 ... 22.540 16.67 152.20 1575.0 \n", + ".. ... ... ... ... ... \n", + "564 ... 25.450 26.40 166.10 2027.0 \n", + "565 ... 23.690 38.25 155.00 1731.0 \n", + "566 ... 18.980 34.12 126.70 1124.0 \n", + "567 ... 25.740 39.42 184.60 1821.0 \n", + "568 ... 9.456 30.37 59.16 268.6 \n", + "\n", + " smoothness_worst compactness_worst concavity_worst \\\n", + "0 0.16220 0.66560 0.7119 \n", + "1 0.12380 0.18660 0.2416 \n", + "2 0.14440 0.42450 0.4504 \n", + "3 0.20980 0.86630 0.6869 \n", + "4 0.13740 0.20500 0.4000 \n", + ".. ... ... ... 
\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer = pd.read_csv('dataset/wdbc.csv')\n", + "cancer" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Malignant', 'Benign'], dtype=object)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# clean data\n", + "cancer[\"diagnosis\"] = cancer[\"diagnosis\"].replace({\n", + " \"M\" : \"Malignant\",\n", + " \"B\" : \"Benign\"\n", + "})\n", + "\n", + "cancer[\"diagnosis\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Start fitting the model ### AW" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# create a copy of the df\n", + "standardized_cancer = cancer.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "# remove colns that not need to be scale\n", + "columns_to_exclude = ['id', 'diagnosis']" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "# create a df that exclude the response variable and 
index\n", + "\n", + "columns_to_scale = standardized_cancer.columns.difference(columns_to_exclude)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "# scale the df \n", + "scaler = StandardScaler()\n", + "standardized_cancer[columns_to_scale] = scaler.fit_transform(standardized_cancer[columns_to_scale])" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302Malignant1.097064-2.0733351.2699340.9843751.5684663.2835152.6528742.532475...1.886690-1.3592932.3036012.0012371.3076862.6166652.1095262.2960762.7506221.937015
1842517Malignant1.829821-0.3536321.6859551.908708-0.826962-0.487072-0.0238460.548144...1.805927-0.3692031.5351261.890489-0.375612-0.430444-0.1467491.087084-0.2438900.281190
284300903Malignant1.5798880.4561871.5665031.5588840.9422101.0529261.3634782.037231...1.511870-0.0239741.3474751.4562850.5274071.0829320.8549741.9550001.1522550.201391
384348301Malignant-0.7689090.253732-0.592687-0.7644643.2835533.4029091.9158971.451707...-0.2814640.133984-0.249939-0.5500213.3942753.8933971.9895882.1757866.0460414.935010
484358402Malignant1.750297-1.1518161.7765731.8262290.2803720.5393401.3710111.428493...1.298575-1.4667701.3385391.2207240.220556-0.3133950.6131790.729259-0.868353-0.397100
..................................................................
564926424Malignant2.1109950.7214732.0607862.3438561.0418420.2190601.9472852.320965...1.9011850.1177001.7525632.0153010.378365-0.2733180.6645121.629151-1.360158-0.709091
565926682Malignant1.7048542.0851341.6159311.7238420.102458-0.0178330.6930431.263669...1.5367202.0473991.4219401.494959-0.691230-0.3948200.2365730.733827-0.531855-0.973978
566926954Malignant0.7022842.0455740.6726760.577953-0.840484-0.0386800.0465880.105777...0.5613611.3748540.5790010.427906-0.8095870.3507350.3267670.414069-1.104549-0.318409
567927241Malignant1.8383412.3364571.9825241.7352181.5257673.2721443.2969442.658866...1.9612392.2379262.3036011.6531711.4304273.9048483.1976052.2899851.9190832.219635
56892751Benign-1.8084011.221792-1.814389-1.347789-3.112085-1.150752-1.114873-1.261820...-1.4108930.764190-1.432735-1.075813-1.859019-1.207552-1.305831-1.745063-0.048138-0.751207
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean \\\n", + "0 842302 Malignant 1.097064 -2.073335 1.269934 \n", + "1 842517 Malignant 1.829821 -0.353632 1.685955 \n", + "2 84300903 Malignant 1.579888 0.456187 1.566503 \n", + "3 84348301 Malignant -0.768909 0.253732 -0.592687 \n", + "4 84358402 Malignant 1.750297 -1.151816 1.776573 \n", + ".. ... ... ... ... ... \n", + "564 926424 Malignant 2.110995 0.721473 2.060786 \n", + "565 926682 Malignant 1.704854 2.085134 1.615931 \n", + "566 926954 Malignant 0.702284 2.045574 0.672676 \n", + "567 927241 Malignant 1.838341 2.336457 1.982524 \n", + "568 92751 Benign -1.808401 1.221792 -1.814389 \n", + "\n", + " area_mean smoothness_mean compactness_mean concavity_mean \\\n", + "0 0.984375 1.568466 3.283515 2.652874 \n", + "1 1.908708 -0.826962 -0.487072 -0.023846 \n", + "2 1.558884 0.942210 1.052926 1.363478 \n", + "3 -0.764464 3.283553 3.402909 1.915897 \n", + "4 1.826229 0.280372 0.539340 1.371011 \n", + ".. ... ... ... ... \n", + "564 2.343856 1.041842 0.219060 1.947285 \n", + "565 1.723842 0.102458 -0.017833 0.693043 \n", + "566 0.577953 -0.840484 -0.038680 0.046588 \n", + "567 1.735218 1.525767 3.272144 3.296944 \n", + "568 -1.347789 -3.112085 -1.150752 -1.114873 \n", + "\n", + " concave points_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 2.532475 ... 1.886690 -1.359293 2.303601 \n", + "1 0.548144 ... 1.805927 -0.369203 1.535126 \n", + "2 2.037231 ... 1.511870 -0.023974 1.347475 \n", + "3 1.451707 ... -0.281464 0.133984 -0.249939 \n", + "4 1.428493 ... 1.298575 -1.466770 1.338539 \n", + ".. ... ... ... ... ... \n", + "564 2.320965 ... 1.901185 0.117700 1.752563 \n", + "565 1.263669 ... 1.536720 2.047399 1.421940 \n", + "566 0.105777 ... 0.561361 1.374854 0.579001 \n", + "567 2.658866 ... 1.961239 2.237926 2.303601 \n", + "568 -1.261820 ... 
-1.410893 0.764190 -1.432735 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2.001237 1.307686 2.616665 2.109526 \n", + "1 1.890489 -0.375612 -0.430444 -0.146749 \n", + "2 1.456285 0.527407 1.082932 0.854974 \n", + "3 -0.550021 3.394275 3.893397 1.989588 \n", + "4 1.220724 0.220556 -0.313395 0.613179 \n", + ".. ... ... ... ... \n", + "564 2.015301 0.378365 -0.273318 0.664512 \n", + "565 1.494959 -0.691230 -0.394820 0.236573 \n", + "566 0.427906 -0.809587 0.350735 0.326767 \n", + "567 1.653171 1.430427 3.904848 3.197605 \n", + "568 -1.075813 -1.859019 -1.207552 -1.305831 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 2.296076 2.750622 1.937015 \n", + "1 1.087084 -0.243890 0.281190 \n", + "2 1.955000 1.152255 0.201391 \n", + "3 2.175786 6.046041 4.935010 \n", + "4 0.729259 -0.868353 -0.397100 \n", + ".. ... ... ... \n", + "564 1.629151 -1.360158 -0.709091 \n", + "565 0.733827 -0.531855 -0.973978 \n", + "566 0.414069 -1.104549 -0.318409 \n", + "567 2.289985 1.919083 2.219635 \n", + "568 -1.745063 -0.048138 -0.751207 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# call again the whole df\n", + "standardized_cancer" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302Malignant1.097064-2.0733351.2699340.9843751.5684663.2835152.6528742.532475...1.886690-1.3592932.3036012.0012371.3076862.6166652.1095262.2960762.7506221.937015
1842517Malignant1.829821-0.3536321.6859551.908708-0.826962-0.487072-0.0238460.548144...1.805927-0.3692031.5351261.890489-0.375612-0.430444-0.1467491.087084-0.2438900.281190
284300903Malignant1.5798880.4561871.5665031.5588840.9422101.0529261.3634782.037231...1.511870-0.0239741.3474751.4562850.5274071.0829320.8549741.9550001.1522550.201391
384348301Malignant-0.7689090.253732-0.592687-0.7644643.2835533.4029091.9158971.451707...-0.2814640.133984-0.249939-0.5500213.3942753.8933971.9895882.1757866.0460414.935010
484358402Malignant1.750297-1.1518161.7765731.8262290.2803720.5393401.3710111.428493...1.298575-1.4667701.3385391.2207240.220556-0.3133950.6131790.729259-0.868353-0.397100
..................................................................
564926424Malignant2.1109950.7214732.0607862.3438561.0418420.2190601.9472852.320965...1.9011850.1177001.7525632.0153010.378365-0.2733180.6645121.629151-1.360158-0.709091
565926682Malignant1.7048542.0851341.6159311.7238420.102458-0.0178330.6930431.263669...1.5367202.0473991.4219401.494959-0.691230-0.3948200.2365730.733827-0.531855-0.973978
566926954Malignant0.7022842.0455740.6726760.577953-0.840484-0.0386800.0465880.105777...0.5613611.3748540.5790010.427906-0.8095870.3507350.3267670.414069-1.104549-0.318409
567927241Malignant1.8383412.3364571.9825241.7352181.5257673.2721443.2969442.658866...1.9612392.2379262.3036011.6531711.4304273.9048483.1976052.2899851.9190832.219635
56892751Benign-1.8084011.221792-1.814389-1.347789-3.112085-1.150752-1.114873-1.261820...-1.4108930.764190-1.432735-1.075813-1.859019-1.207552-1.305831-1.745063-0.048138-0.751207
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean \\\n", + "0 842302 Malignant 1.097064 -2.073335 1.269934 \n", + "1 842517 Malignant 1.829821 -0.353632 1.685955 \n", + "2 84300903 Malignant 1.579888 0.456187 1.566503 \n", + "3 84348301 Malignant -0.768909 0.253732 -0.592687 \n", + "4 84358402 Malignant 1.750297 -1.151816 1.776573 \n", + ".. ... ... ... ... ... \n", + "564 926424 Malignant 2.110995 0.721473 2.060786 \n", + "565 926682 Malignant 1.704854 2.085134 1.615931 \n", + "566 926954 Malignant 0.702284 2.045574 0.672676 \n", + "567 927241 Malignant 1.838341 2.336457 1.982524 \n", + "568 92751 Benign -1.808401 1.221792 -1.814389 \n", + "\n", + " area_mean smoothness_mean compactness_mean concavity_mean \\\n", + "0 0.984375 1.568466 3.283515 2.652874 \n", + "1 1.908708 -0.826962 -0.487072 -0.023846 \n", + "2 1.558884 0.942210 1.052926 1.363478 \n", + "3 -0.764464 3.283553 3.402909 1.915897 \n", + "4 1.826229 0.280372 0.539340 1.371011 \n", + ".. ... ... ... ... \n", + "564 2.343856 1.041842 0.219060 1.947285 \n", + "565 1.723842 0.102458 -0.017833 0.693043 \n", + "566 0.577953 -0.840484 -0.038680 0.046588 \n", + "567 1.735218 1.525767 3.272144 3.296944 \n", + "568 -1.347789 -3.112085 -1.150752 -1.114873 \n", + "\n", + " concave points_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 2.532475 ... 1.886690 -1.359293 2.303601 \n", + "1 0.548144 ... 1.805927 -0.369203 1.535126 \n", + "2 2.037231 ... 1.511870 -0.023974 1.347475 \n", + "3 1.451707 ... -0.281464 0.133984 -0.249939 \n", + "4 1.428493 ... 1.298575 -1.466770 1.338539 \n", + ".. ... ... ... ... ... \n", + "564 2.320965 ... 1.901185 0.117700 1.752563 \n", + "565 1.263669 ... 1.536720 2.047399 1.421940 \n", + "566 0.105777 ... 0.561361 1.374854 0.579001 \n", + "567 2.658866 ... 1.961239 2.237926 2.303601 \n", + "568 -1.261820 ... 
-1.410893 0.764190 -1.432735 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2.001237 1.307686 2.616665 2.109526 \n", + "1 1.890489 -0.375612 -0.430444 -0.146749 \n", + "2 1.456285 0.527407 1.082932 0.854974 \n", + "3 -0.550021 3.394275 3.893397 1.989588 \n", + "4 1.220724 0.220556 -0.313395 0.613179 \n", + ".. ... ... ... ... \n", + "564 2.015301 0.378365 -0.273318 0.664512 \n", + "565 1.494959 -0.691230 -0.394820 0.236573 \n", + "566 0.427906 -0.809587 0.350735 0.326767 \n", + "567 1.653171 1.430427 3.904848 3.197605 \n", + "568 -1.075813 -1.859019 -1.207552 -1.305831 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 2.296076 2.750622 1.937015 \n", + "1 1.087084 -0.243890 0.281190 \n", + "2 1.955000 1.152255 0.201391 \n", + "3 2.175786 6.046041 4.935010 \n", + "4 0.729259 -0.868353 -0.397100 \n", + ".. ... ... ... \n", + "564 1.629151 -1.360158 -0.709091 \n", + "565 0.733827 -0.531855 -0.973978 \n", + "566 0.414069 -1.104549 -0.318409 \n", + "567 2.289985 1.919083 2.219635 \n", + "568 -1.745063 -0.048138 -0.751207 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "standardized_cancer = cancer.copy()\n", + "\n", + "columns_to_exclude = ['id', 'diagnosis']\n", + "\n", + "columns_to_scale = standardized_cancer.columns.difference(columns_to_exclude)\n", + "\n", + "scaler = StandardScaler()\n", + "standardized_cancer[columns_to_scale] = scaler.fit_transform(standardized_cancer[columns_to_scale])\n", + "standardized_cancer" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "#np.random.seed(1)\n", + "\n", + "cancer_train, cancer_test = train_test_split(\n", + " standardized_cancer, train_size=0.75, shuffle= True,\n", + " stratify=standardized_cancer[\"diagnosis\"], random_state= 123\n", + ")\n" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 143 entries, 257 to 200\n", + "Data columns (total 32 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 143 non-null int64 \n", + " 1 diagnosis 143 non-null object \n", + " 2 radius_mean 143 non-null float64\n", + " 3 texture_mean 143 non-null float64\n", + " 4 perimeter_mean 143 non-null float64\n", + " 5 area_mean 143 non-null float64\n", + " 6 smoothness_mean 143 non-null float64\n", + " 7 compactness_mean 143 non-null float64\n", + " 8 concavity_mean 143 non-null float64\n", + " 9 concave points_mean 143 non-null float64\n", + " 10 symmetry_mean 143 non-null float64\n", + " 11 fractal_dimension_mean 143 non-null float64\n", + " 12 radius_se 143 non-null float64\n", + " 13 texture_se 143 non-null float64\n", + " 14 perimeter_se 143 non-null float64\n", + " 15 area_se 143 non-null float64\n", + " 16 smoothness_se 143 non-null float64\n", + " 17 compactness_se 143 non-null float64\n", + " 18 concavity_se 143 non-null float64\n", + " 19 concave points_se 143 non-null float64\n", + " 20 symmetry_se 143 non-null float64\n", + " 21 fractal_dimension_se 143 non-null float64\n", + " 22 radius_worst 143 non-null float64\n", + " 23 texture_worst 143 non-null float64\n", + " 24 perimeter_worst 143 non-null float64\n", + " 25 area_worst 143 non-null float64\n", + " 26 smoothness_worst 143 non-null float64\n", + " 27 compactness_worst 143 non-null float64\n", + " 28 concavity_worst 143 non-null float64\n", + " 29 concave points_worst 143 non-null float64\n", + " 30 symmetry_worst 143 non-null float64\n", + " 31 fractal_dimension_worst 143 non-null float64\n", + "dtypes: float64(30), int64(1), object(1)\n", + "memory usage: 36.9+ KB\n" + ] + } + ], + "source": [ + "#cancer_train.info()\n", + "cancer_test.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": 
{}, + "outputs": [], + "source": [ + "knn = KNeighborsClassifier(n_neighbors=5\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "X = cancer_train[['perimeter_mean','concavity_mean']]\n", + "y = cancer_train ['diagnosis']" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn.fit(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosispredicted
257886776MalignantMalignant
38290250BenignBenign
241883539BenignBenign
52791813702BenignBenign
3689011971MalignantMalignant
............
208510653BenignBenign
247884626BenignMalignant
29853201MalignantMalignant
17787281702MalignantMalignant
200877501BenignBenign
\n", + "

143 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis predicted\n", + "257 886776 Malignant Malignant\n", + "382 90250 Benign Benign\n", + "241 883539 Benign Benign\n", + "527 91813702 Benign Benign\n", + "368 9011971 Malignant Malignant\n", + ".. ... ... ...\n", + "20 8510653 Benign Benign\n", + "247 884626 Benign Malignant\n", + "29 853201 Malignant Malignant\n", + "177 87281702 Malignant Malignant\n", + "200 877501 Benign Benign\n", + "\n", + "[143 rows x 3 columns]" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer_test[\"predicted\"] = knn.predict(cancer_test[[\"perimeter_mean\", \"concavity_mean\"]])\n", + "cancer_test[[\"id\", \"diagnosis\", \"predicted\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9230769230769231" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn.score(\n", + " cancer_test[[\"perimeter_mean\", \"concavity_mean\"]],\n", + " cancer_test[\"diagnosis\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PredictedBenignMalignant
Actual
Benign846
Malignant548
\n", + "
" + ], + "text/plain": [ + "Predicted Benign Malignant\n", + "Actual \n", + "Benign 84 6\n", + "Malignant 5 48" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.crosstab(\n", + " cancer_test[\"diagnosis\"],\n", + " cancer_test[\"predicted\"],\n", + " rownames = ['Actual'],\n", + " colnames = ['Predicted']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8888888888888888" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision_score(\n", + " y_true=cancer_test[\"diagnosis\"],\n", + " y_pred=cancer_test[\"predicted\"],\n", + " pos_label=\"Malignant\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9056603773584906" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recall_score(\n", + " y_true=cancer_test[\"diagnosis\"],\n", + " y_pred=cancer_test[\"predicted\"],\n", + " pos_label=\"Malignant\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=3)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer_subtrain, cancer_validation = train_test_split(\n", + " cancer_train, train_size=0.75, stratify=cancer_train[\"diagnosis\"]\n", + ")\n", + "\n", + "# fit the model on the sub-training data\n", + "knn = KNeighborsClassifier(n_neighbors=3)\n", + "X = cancer_subtrain[[\"perimeter_mean\", \"concavity_mean\"]]\n", + "y = cancer_subtrain[\"diagnosis\"]\n", + "knn.fit(X, y)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8785046728971962" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acc = knn.score(\n", + " cancer_validation[[\"perimeter_mean\", \"concavity_mean\"]],\n", + " cancer_validation[\"diagnosis\"]\n", + ")\n", + "acc" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fit_timescore_timetest_score
00.0095190.0075060.906977
10.0045150.0075290.905882
20.0020060.0102540.929412
30.0055200.0110330.894118
40.0030000.0097660.894118
\n", + "
" + ], + "text/plain": [ + " fit_time score_time test_score\n", + "0 0.009519 0.007506 0.906977\n", + "1 0.004515 0.007529 0.905882\n", + "2 0.002006 0.010254 0.929412\n", + "3 0.005520 0.011033 0.894118\n", + "4 0.003000 0.009766 0.894118" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier(n_neighbors=3)\n", + "X = cancer_train[[\"perimeter_mean\", \"concavity_mean\"]]\n", + "y = cancer_train[\"diagnosis\"]\n", + "\n", + "returned_dictionary = cross_validate(\n", + " estimator=knn,\n", + " cv=5, # setting up the cross validation number\n", + " X=X,\n", + " y=y\n", + ")\n", + "\n", + "cv_5_df = pd.DataFrame(returned_dictionary) # Converting it to pandas DataFrame\n", + "\n", + "cv_5_df" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fit_timescore_timetest_score
mean0.0049120.0092180.906101
sem0.0013010.0007230.006448
\n", + "
" + ], + "text/plain": [ + " fit_time score_time test_score\n", + "mean 0.004912 0.009218 0.906101\n", + "sem 0.001301 0.000723 0.006448" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_5_metrics = cv_5_df.agg([\"mean\", \"sem\"])\n", + "\n", + "cv_5_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "parameter_grid = {\n", + " \"n_neighbors\": range(1, 100, 5),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "cancer_tune_grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(n_neighbors=3),\n",
+       "             param_grid={'n_neighbors': range(1, 100, 5)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=10, estimator=KNeighborsClassifier(n_neighbors=3),\n", + " param_grid={'n_neighbors': range(1, 100, 5)})" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer_tune_grid.fit(\n", + " cancer_train[[\"perimeter_mean\", \"concavity_mean\"]],\n", + " cancer_train[\"diagnosis\"]\n", + ")\n", + "\n", + "#accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)\n", + "#accuracies_grid" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_n_neighborsparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoresplit5_test_scoresplit6_test_scoresplit7_test_scoresplit8_test_scoresplit9_test_scoremean_test_scorestd_test_scorerank_test_score
00.0022270.0007480.0053100.0013411{'n_neighbors': 1}1.0000000.8604650.9534880.8837210.8837210.9302330.8571430.8571430.8809520.9047620.9011630.04462520
10.0029660.0008810.0043590.0007456{'n_neighbors': 6}0.9302330.9069770.9534880.9534880.9767440.9069770.9047620.8571430.8809520.9285710.9199340.0341885
20.0026590.0010080.0051640.00118411{'n_neighbors': 11}0.9534880.8837210.9302330.9302330.9302330.9302330.9047620.8571430.8571430.9047620.9081950.03135218
30.0021570.0003820.0048820.00150416{'n_neighbors': 16}0.9534880.8604650.9534880.9534880.9534880.9069770.8809520.8571430.8571430.9285710.9105200.04099015
40.0022780.0007060.0050060.00124021{'n_neighbors': 21}0.9534880.8837210.9534880.9302330.9534880.9069770.8809520.8571430.8571430.9285710.9105200.03681915
50.0023580.0003960.0047520.00124726{'n_neighbors': 26}0.9534880.8604650.9534880.9534880.9534880.9069770.8809520.8571430.8571430.9285710.9105200.04099015
60.0024050.0004530.0045780.00146231{'n_neighbors': 31}0.9534880.8604650.9534880.9534880.9534880.9069770.8809520.8571430.8333330.9285710.9081400.04455719
70.0026150.0009590.0049630.00107636{'n_neighbors': 36}0.9534880.8604650.9534880.9534880.9534880.9069770.9285710.8571430.8333330.9285710.9129010.04393814
80.0025400.0005540.0050020.00101041{'n_neighbors': 41}0.9534880.8837210.9534880.9534880.9534880.9069770.9285710.8571430.8333330.9285710.9152270.04165713
90.0025530.0007500.0040710.00075646{'n_neighbors': 46}0.9534880.8837210.9534880.9534880.9534880.9069770.9285710.8571430.8571430.9285710.9176080.0373686
100.0025220.0006530.0041510.00077151{'n_neighbors': 51}0.9534880.8837210.9534880.9534880.9534880.9069770.9047620.8571430.8809520.9285710.9176080.0341996
110.0025260.0010190.0048130.00095556{'n_neighbors': 56}0.9534880.8837210.9534880.9302330.9534880.9069770.9047620.8571430.8809520.9285710.9152820.03242512
120.0021910.0007490.0048540.00101661{'n_neighbors': 61}0.9534880.8837210.9534880.9534880.9534880.9069770.9047620.8571430.8809520.9285710.9176080.0341996
130.0027470.0013660.0050070.00148066{'n_neighbors': 66}0.9534880.8837210.9534880.9534880.9534880.9069770.9047620.8571430.8809520.9285710.9176080.0341996
140.0026150.0009830.0043570.00059771{'n_neighbors': 71}0.9534880.8837210.9534880.9534880.9534880.9069770.9047620.8571430.8809520.9285710.9176080.0341996
150.0020960.0006320.0050870.00147976{'n_neighbors': 76}0.9534880.8837210.9534880.9534880.9534880.9069770.9047620.8571430.8809520.9285710.9176080.0341996
160.0025620.0009850.0045760.00062881{'n_neighbors': 81}0.9534880.9069770.9534880.9534880.9534880.9069770.9285710.8571430.8809520.9523810.9246950.0334701
170.0022400.0004460.0042720.00047686{'n_neighbors': 86}0.9534880.9069770.9534880.9534880.9534880.9069770.9285710.8571430.8809520.9523810.9246950.0334701
180.0022740.0004390.0045370.00062291{'n_neighbors': 91}0.9534880.9069770.9534880.9534880.9534880.9069770.9285710.8333330.8809520.9523810.9223150.0386393
190.0019230.0004000.0045050.00064896{'n_neighbors': 96}0.9534880.9069770.9534880.9534880.9534880.9069770.9285710.8333330.8809520.9523810.9223150.0386393
\n", + "
" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.002227 0.000748 0.005310 0.001341 \n", + "1 0.002966 0.000881 0.004359 0.000745 \n", + "2 0.002659 0.001008 0.005164 0.001184 \n", + "3 0.002157 0.000382 0.004882 0.001504 \n", + "4 0.002278 0.000706 0.005006 0.001240 \n", + "5 0.002358 0.000396 0.004752 0.001247 \n", + "6 0.002405 0.000453 0.004578 0.001462 \n", + "7 0.002615 0.000959 0.004963 0.001076 \n", + "8 0.002540 0.000554 0.005002 0.001010 \n", + "9 0.002553 0.000750 0.004071 0.000756 \n", + "10 0.002522 0.000653 0.004151 0.000771 \n", + "11 0.002526 0.001019 0.004813 0.000955 \n", + "12 0.002191 0.000749 0.004854 0.001016 \n", + "13 0.002747 0.001366 0.005007 0.001480 \n", + "14 0.002615 0.000983 0.004357 0.000597 \n", + "15 0.002096 0.000632 0.005087 0.001479 \n", + "16 0.002562 0.000985 0.004576 0.000628 \n", + "17 0.002240 0.000446 0.004272 0.000476 \n", + "18 0.002274 0.000439 0.004537 0.000622 \n", + "19 0.001923 0.000400 0.004505 0.000648 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 1 {'n_neighbors': 1} 1.000000 \n", + "1 6 {'n_neighbors': 6} 0.930233 \n", + "2 11 {'n_neighbors': 11} 0.953488 \n", + "3 16 {'n_neighbors': 16} 0.953488 \n", + "4 21 {'n_neighbors': 21} 0.953488 \n", + "5 26 {'n_neighbors': 26} 0.953488 \n", + "6 31 {'n_neighbors': 31} 0.953488 \n", + "7 36 {'n_neighbors': 36} 0.953488 \n", + "8 41 {'n_neighbors': 41} 0.953488 \n", + "9 46 {'n_neighbors': 46} 0.953488 \n", + "10 51 {'n_neighbors': 51} 0.953488 \n", + "11 56 {'n_neighbors': 56} 0.953488 \n", + "12 61 {'n_neighbors': 61} 0.953488 \n", + "13 66 {'n_neighbors': 66} 0.953488 \n", + "14 71 {'n_neighbors': 71} 0.953488 \n", + "15 76 {'n_neighbors': 76} 0.953488 \n", + "16 81 {'n_neighbors': 81} 0.953488 \n", + "17 86 {'n_neighbors': 86} 0.953488 \n", + "18 91 {'n_neighbors': 91} 0.953488 \n", + "19 96 {'n_neighbors': 96} 0.953488 \n", + "\n", + " split1_test_score split2_test_score split3_test_score 
\\\n", + "0 0.860465 0.953488 0.883721 \n", + "1 0.906977 0.953488 0.953488 \n", + "2 0.883721 0.930233 0.930233 \n", + "3 0.860465 0.953488 0.953488 \n", + "4 0.883721 0.953488 0.930233 \n", + "5 0.860465 0.953488 0.953488 \n", + "6 0.860465 0.953488 0.953488 \n", + "7 0.860465 0.953488 0.953488 \n", + "8 0.883721 0.953488 0.953488 \n", + "9 0.883721 0.953488 0.953488 \n", + "10 0.883721 0.953488 0.953488 \n", + "11 0.883721 0.953488 0.930233 \n", + "12 0.883721 0.953488 0.953488 \n", + "13 0.883721 0.953488 0.953488 \n", + "14 0.883721 0.953488 0.953488 \n", + "15 0.883721 0.953488 0.953488 \n", + "16 0.906977 0.953488 0.953488 \n", + "17 0.906977 0.953488 0.953488 \n", + "18 0.906977 0.953488 0.953488 \n", + "19 0.906977 0.953488 0.953488 \n", + "\n", + " split4_test_score split5_test_score split6_test_score \\\n", + "0 0.883721 0.930233 0.857143 \n", + "1 0.976744 0.906977 0.904762 \n", + "2 0.930233 0.930233 0.904762 \n", + "3 0.953488 0.906977 0.880952 \n", + "4 0.953488 0.906977 0.880952 \n", + "5 0.953488 0.906977 0.880952 \n", + "6 0.953488 0.906977 0.880952 \n", + "7 0.953488 0.906977 0.928571 \n", + "8 0.953488 0.906977 0.928571 \n", + "9 0.953488 0.906977 0.928571 \n", + "10 0.953488 0.906977 0.904762 \n", + "11 0.953488 0.906977 0.904762 \n", + "12 0.953488 0.906977 0.904762 \n", + "13 0.953488 0.906977 0.904762 \n", + "14 0.953488 0.906977 0.904762 \n", + "15 0.953488 0.906977 0.904762 \n", + "16 0.953488 0.906977 0.928571 \n", + "17 0.953488 0.906977 0.928571 \n", + "18 0.953488 0.906977 0.928571 \n", + "19 0.953488 0.906977 0.928571 \n", + "\n", + " split7_test_score split8_test_score split9_test_score mean_test_score \\\n", + "0 0.857143 0.880952 0.904762 0.901163 \n", + "1 0.857143 0.880952 0.928571 0.919934 \n", + "2 0.857143 0.857143 0.904762 0.908195 \n", + "3 0.857143 0.857143 0.928571 0.910520 \n", + "4 0.857143 0.857143 0.928571 0.910520 \n", + "5 0.857143 0.857143 0.928571 0.910520 \n", + "6 0.857143 0.833333 0.928571 0.908140 \n", + "7 
0.857143 0.833333 0.928571 0.912901 \n", + "8 0.857143 0.833333 0.928571 0.915227 \n", + "9 0.857143 0.857143 0.928571 0.917608 \n", + "10 0.857143 0.880952 0.928571 0.917608 \n", + "11 0.857143 0.880952 0.928571 0.915282 \n", + "12 0.857143 0.880952 0.928571 0.917608 \n", + "13 0.857143 0.880952 0.928571 0.917608 \n", + "14 0.857143 0.880952 0.928571 0.917608 \n", + "15 0.857143 0.880952 0.928571 0.917608 \n", + "16 0.857143 0.880952 0.952381 0.924695 \n", + "17 0.857143 0.880952 0.952381 0.924695 \n", + "18 0.833333 0.880952 0.952381 0.922315 \n", + "19 0.833333 0.880952 0.952381 0.922315 \n", + "\n", + " std_test_score rank_test_score \n", + "0 0.044625 20 \n", + "1 0.034188 5 \n", + "2 0.031352 18 \n", + "3 0.040990 15 \n", + "4 0.036819 15 \n", + "5 0.040990 15 \n", + "6 0.044557 19 \n", + "7 0.043938 14 \n", + "8 0.041657 13 \n", + "9 0.037368 6 \n", + "10 0.034199 6 \n", + "11 0.032425 12 \n", + "12 0.034199 6 \n", + "13 0.034199 6 \n", + "14 0.034199 6 \n", + "15 0.034199 6 \n", + "16 0.033470 1 \n", + "17 0.033470 1 \n", + "18 0.038639 3 \n", + "19 0.038639 3 " + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)\n", + "accuracies_grid" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create the plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot mean test scores with error bars\n", + "plt.plot(accuracies_grid['param_n_neighbors'], accuracies_grid['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 81}" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer_tune_grid.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "large_param_grid = {\n", + " \"n_neighbors\": range(1, 385, 10),\n", + "}\n", + "\n", + "large_cancer_tune_grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=large_param_grid,\n", + " cv=10\n", + ")\n", + "\n", + "large_cancer_tune_grid.fit(\n", + " cancer_train[[\"perimeter_mean\", \"concavity_mean\"]],\n", + " cancer_train[\"diagnosis\"]\n", + ")\n", + "\n", + "large_accuracies_grid = pd.DataFrame(large_cancer_tune_grid.cv_results_)\n", + "\n", + "# Create the plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot mean test scores with error bars\n", + "plt.plot(large_accuracies_grid['param_n_neighbors'], large_accuracies_grid['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#split the data \n", + "# grid\n", + "grid \n", + "\n", + "K achieve higheest\n", + "using score method" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import GridSearchCV, train_test_split\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn import set_config\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.metrics import mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import 
set_config\n", + "set_config(transform_output=\"pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
streetcityzipstatebedsbathssq__fttypesale_datepricelatitudelongitude
01 KENNELFORD CIRSACRAMENTO95823CA321144ResidentialMon May 19 00:00:00 EDT 200820034538.464520-121.427606
110 SEA FOAM CTSACRAMENTO95831CA332052ResidentialWed May 21 00:00:00 EDT 200841500038.487885-121.545947
2100 CHELSEA CTFOLSOM95630CA321905ResidentialMon May 19 00:00:00 EDT 200850000038.694350-121.177259
3100 REBECCA WAYFOLSOM95630CA322185ResidentialWed May 21 00:00:00 EDT 200834425038.684790-121.149199
4100 TOURMALINE CIRSACRAMENTO95834CA533076ResidentialMon May 19 00:00:00 EDT 200824000038.634370-121.510779
.......................................
8099880 IZILDA CTSACRAMENTO95829CA543863ResidentialFri May 16 00:00:00 EDT 200859869538.453260-121.325730
810993 MANTON CTGALT95632CA432307ResidentialTue May 20 00:00:00 EDT 200830000038.272942-121.289148
8119937 BURLINE STSACRAMENTO95827CA321092ResidentialFri May 16 00:00:00 EDT 200815000038.559641-121.323160
8129949 NESTLING CIRELK GROVE95757CA321543ResidentialFri May 16 00:00:00 EDT 200827500038.397455-121.468391
8139970 STATE HIGHWAY 193PLACERVILLE95667CA431929ResidentialTue May 20 00:00:00 EDT 200848500038.787877-120.816676
\n", + "

814 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " street city zip state beds baths sq__ft \\\n", + "0 1 KENNELFORD CIR SACRAMENTO 95823 CA 3 2 1144 \n", + "1 10 SEA FOAM CT SACRAMENTO 95831 CA 3 3 2052 \n", + "2 100 CHELSEA CT FOLSOM 95630 CA 3 2 1905 \n", + "3 100 REBECCA WAY FOLSOM 95630 CA 3 2 2185 \n", + "4 100 TOURMALINE CIR SACRAMENTO 95834 CA 5 3 3076 \n", + ".. ... ... ... ... ... ... ... \n", + "809 9880 IZILDA CT SACRAMENTO 95829 CA 5 4 3863 \n", + "810 993 MANTON CT GALT 95632 CA 4 3 2307 \n", + "811 9937 BURLINE ST SACRAMENTO 95827 CA 3 2 1092 \n", + "812 9949 NESTLING CIR ELK GROVE 95757 CA 3 2 1543 \n", + "813 9970 STATE HIGHWAY 193 PLACERVILLE 95667 CA 4 3 1929 \n", + "\n", + " type sale_date price latitude longitude \n", + "0 Residential Mon May 19 00:00:00 EDT 2008 200345 38.464520 -121.427606 \n", + "1 Residential Wed May 21 00:00:00 EDT 2008 415000 38.487885 -121.545947 \n", + "2 Residential Mon May 19 00:00:00 EDT 2008 500000 38.694350 -121.177259 \n", + "3 Residential Wed May 21 00:00:00 EDT 2008 344250 38.684790 -121.149199 \n", + "4 Residential Mon May 19 00:00:00 EDT 2008 240000 38.634370 -121.510779 \n", + ".. ... ... ... ... ... \n", + "809 Residential Fri May 16 00:00:00 EDT 2008 598695 38.453260 -121.325730 \n", + "810 Residential Tue May 20 00:00:00 EDT 2008 300000 38.272942 -121.289148 \n", + "811 Residential Fri May 16 00:00:00 EDT 2008 150000 38.559641 -121.323160 \n", + "812 Residential Fri May 16 00:00:00 EDT 2008 275000 38.397455 -121.468391 \n", + "813 Residential Tue May 20 00:00:00 EDT 2008 485000 38.787877 -120.816676 \n", + "\n", + "[814 rows x 12 columns]" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sacramento = pd.read_csv(\"dataset/sacramento.csv\")\n", + "sacramento" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot\n", + "plt.scatter(sacramento[\"sq__ft\"], sacramento['price'])\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel(\"House size (square feet)\")\n", + "plt.ylabel('Price (USD)')\n", + "plt.title('Scatter Plot of House size vs Price')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "# don't need to deal with outliner\n", + "# price can be affect the house size\n", + "# taking average of their value\n", + "# predict the price of house" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
streetcityzipstatebedsbathssq__fttypesale_datepricelatitudelongitude
1852610 PHYLLIS AVESACRAMENTO95820CA21804ResidentialMon May 19 00:00:00 EDT 200812000038.531050-121.479574
210294 SPARROW DRGALT95632CA432214ResidentialFri May 16 00:00:00 EDT 200827800038.258976-121.321266
4876000 BIRCHGLADE WAYCITRUS HEIGHTS95621CA421351ResidentialMon May 19 00:00:00 EDT 200815800038.701660-121.323249
267361 MAHONIA CIRSACRAMENTO95835CA432175ResidentialMon May 19 00:00:00 EDT 200826100038.676172-121.509761
6748164 CHENIN BLANC LNFAIR OAKS95628CA221315ResidentialTue May 20 00:00:00 EDT 200823000038.665644-121.259969
177251 CHANGO CIRSACRAMENTO95835CA422218ResidentialMon May 19 00:00:00 EDT 200831132838.682370-121.539147
217301 OLIVADI WAYSACRAMENTO95834CA221250CondoMon May 19 00:00:00 EDT 200823250038.644406-121.549049
5897204 THOMAS DRNORTH HIGHLANDS95660CA321152ResidentialMon May 19 00:00:00 EDT 200815800038.697898-121.377687
6968306 CURLEW CTCITRUS HEIGHTS95621CA421280ResidentialMon May 19 00:00:00 EDT 200816729338.715781-121.298519
6027349 FLETCHER FARM DRSACRAMENTO95828CA421587ResidentialMon May 19 00:00:00 EDT 200812750038.490690-121.382619
229312 RIVER ISLE WAYSACRAMENTO95831CA321375ResidentialMon May 19 00:00:00 EDT 200823200038.490260-121.550527
4565651 OVERLEAF WAYSACRAMENTO95835CA421910ResidentialTue May 20 00:00:00 EDT 200830050038.677454-121.494791
5266344 LAGUNA MIRAGE LNELK GROVE95758CA221112ResidentialMon May 19 00:00:00 EDT 200821369738.423963-121.428875
531648 SANTA ANA AVESACRAMENTO95838CA321211ResidentialTue May 20 00:00:00 EDT 200813500038.658478-121.450409
2523409 VIRGO STSACRAMENTO95827CA321320ResidentialFri May 16 00:00:00 EDT 200811550038.563402-121.327747
2273108 DELWOOD WAYSACRAMENTO95821CA422053ResidentialFri May 16 00:00:00 EDT 200845000038.621566-121.370882
5676943 WOLFGRAM WAYSACRAMENTO95828CA421176ResidentialMon May 19 00:00:00 EDT 200824723438.489215-121.419546
1622361 LA LOMA DRRANCHO CORDOVA95670CA321115ResidentialFri May 16 00:00:00 EDT 200811600038.593680-121.316054
2403240 S STSACRAMENTO95816CA211269ResidentialTue May 20 00:00:00 EDT 200824500038.562296-121.467489
751401 STERLING STSACRAMENTO95822CA21810ResidentialFri May 16 00:00:00 EDT 200810800038.520319-121.504727
3884901 MILLNER WAYELK GROVE95757CA321843ResidentialWed May 21 00:00:00 EDT 200825420038.386920-121.447349
6578 LA ROCAS CTSACRAMENTO95823CA321217ResidentialThu May 15 00:00:00 EDT 200815108738.466160-121.448283
2793729 BAINBRIDGE DRNORTH HIGHLANDS95660CA32901ResidentialWed May 21 00:00:00 EDT 200812500038.701499-121.376220
6017344 BUTTERBALL WAYSACRAMENTO95842CA321503ResidentialTue May 20 00:00:00 EDT 200824500038.699489-121.361828
7889452 RED SPRUCE WAYELK GROVE95624CA632555ResidentialFri May 16 00:00:00 EDT 200830000038.404505-121.346938
3034008 GREY LIVERY WAYANTELOPE95843CA321669ResidentialFri May 16 00:00:00 EDT 200816875038.718460-121.370862
2213035 BRUNNET LNSACRAMENTO95833CA321522ResidentialTue May 20 00:00:00 EDT 200822500038.624762-121.522775
4100 TOURMALINE CIRSACRAMENTO95834CA533076ResidentialMon May 19 00:00:00 EDT 200824000038.634370-121.510779
2032901 PINTAIL WAYELK GROVE95757CA433070ResidentialTue May 20 00:00:00 EDT 200849500038.398488-121.473424
2733692 PAYNE WAYNORTH HIGHLANDS95660CA31957ResidentialTue May 20 00:00:00 EDT 200812900038.666540-121.378298
\n", + "
" + ], + "text/plain": [ + " street city zip state beds baths sq__ft \\\n", + "185 2610 PHYLLIS AVE SACRAMENTO 95820 CA 2 1 804 \n", + "210 294 SPARROW DR GALT 95632 CA 4 3 2214 \n", + "487 6000 BIRCHGLADE WAY CITRUS HEIGHTS 95621 CA 4 2 1351 \n", + "267 361 MAHONIA CIR SACRAMENTO 95835 CA 4 3 2175 \n", + "674 8164 CHENIN BLANC LN FAIR OAKS 95628 CA 2 2 1315 \n", + "177 251 CHANGO CIR SACRAMENTO 95835 CA 4 2 2218 \n", + "217 301 OLIVADI WAY SACRAMENTO 95834 CA 2 2 1250 \n", + "589 7204 THOMAS DR NORTH HIGHLANDS 95660 CA 3 2 1152 \n", + "696 8306 CURLEW CT CITRUS HEIGHTS 95621 CA 4 2 1280 \n", + "602 7349 FLETCHER FARM DR SACRAMENTO 95828 CA 4 2 1587 \n", + "229 312 RIVER ISLE WAY SACRAMENTO 95831 CA 3 2 1375 \n", + "456 5651 OVERLEAF WAY SACRAMENTO 95835 CA 4 2 1910 \n", + "526 6344 LAGUNA MIRAGE LN ELK GROVE 95758 CA 2 2 1112 \n", + "531 648 SANTA ANA AVE SACRAMENTO 95838 CA 3 2 1211 \n", + "252 3409 VIRGO ST SACRAMENTO 95827 CA 3 2 1320 \n", + "227 3108 DELWOOD WAY SACRAMENTO 95821 CA 4 2 2053 \n", + "567 6943 WOLFGRAM WAY SACRAMENTO 95828 CA 4 2 1176 \n", + "162 2361 LA LOMA DR RANCHO CORDOVA 95670 CA 3 2 1115 \n", + "240 3240 S ST SACRAMENTO 95816 CA 2 1 1269 \n", + "75 1401 STERLING ST SACRAMENTO 95822 CA 2 1 810 \n", + "388 4901 MILLNER WAY ELK GROVE 95757 CA 3 2 1843 \n", + "657 8 LA ROCAS CT SACRAMENTO 95823 CA 3 2 1217 \n", + "279 3729 BAINBRIDGE DR NORTH HIGHLANDS 95660 CA 3 2 901 \n", + "601 7344 BUTTERBALL WAY SACRAMENTO 95842 CA 3 2 1503 \n", + "788 9452 RED SPRUCE WAY ELK GROVE 95624 CA 6 3 2555 \n", + "303 4008 GREY LIVERY WAY ANTELOPE 95843 CA 3 2 1669 \n", + "221 3035 BRUNNET LN SACRAMENTO 95833 CA 3 2 1522 \n", + "4 100 TOURMALINE CIR SACRAMENTO 95834 CA 5 3 3076 \n", + "203 2901 PINTAIL WAY ELK GROVE 95757 CA 4 3 3070 \n", + "273 3692 PAYNE WAY NORTH HIGHLANDS 95660 CA 3 1 957 \n", + "\n", + " type sale_date price latitude longitude \n", + "185 Residential Mon May 19 00:00:00 EDT 2008 120000 38.531050 -121.479574 \n", + "210 Residential Fri May 
16 00:00:00 EDT 2008 278000 38.258976 -121.321266 \n", + "487 Residential Mon May 19 00:00:00 EDT 2008 158000 38.701660 -121.323249 \n", + "267 Residential Mon May 19 00:00:00 EDT 2008 261000 38.676172 -121.509761 \n", + "674 Residential Tue May 20 00:00:00 EDT 2008 230000 38.665644 -121.259969 \n", + "177 Residential Mon May 19 00:00:00 EDT 2008 311328 38.682370 -121.539147 \n", + "217 Condo Mon May 19 00:00:00 EDT 2008 232500 38.644406 -121.549049 \n", + "589 Residential Mon May 19 00:00:00 EDT 2008 158000 38.697898 -121.377687 \n", + "696 Residential Mon May 19 00:00:00 EDT 2008 167293 38.715781 -121.298519 \n", + "602 Residential Mon May 19 00:00:00 EDT 2008 127500 38.490690 -121.382619 \n", + "229 Residential Mon May 19 00:00:00 EDT 2008 232000 38.490260 -121.550527 \n", + "456 Residential Tue May 20 00:00:00 EDT 2008 300500 38.677454 -121.494791 \n", + "526 Residential Mon May 19 00:00:00 EDT 2008 213697 38.423963 -121.428875 \n", + "531 Residential Tue May 20 00:00:00 EDT 2008 135000 38.658478 -121.450409 \n", + "252 Residential Fri May 16 00:00:00 EDT 2008 115500 38.563402 -121.327747 \n", + "227 Residential Fri May 16 00:00:00 EDT 2008 450000 38.621566 -121.370882 \n", + "567 Residential Mon May 19 00:00:00 EDT 2008 247234 38.489215 -121.419546 \n", + "162 Residential Fri May 16 00:00:00 EDT 2008 116000 38.593680 -121.316054 \n", + "240 Residential Tue May 20 00:00:00 EDT 2008 245000 38.562296 -121.467489 \n", + "75 Residential Fri May 16 00:00:00 EDT 2008 108000 38.520319 -121.504727 \n", + "388 Residential Wed May 21 00:00:00 EDT 2008 254200 38.386920 -121.447349 \n", + "657 Residential Thu May 15 00:00:00 EDT 2008 151087 38.466160 -121.448283 \n", + "279 Residential Wed May 21 00:00:00 EDT 2008 125000 38.701499 -121.376220 \n", + "601 Residential Tue May 20 00:00:00 EDT 2008 245000 38.699489 -121.361828 \n", + "788 Residential Fri May 16 00:00:00 EDT 2008 300000 38.404505 -121.346938 \n", + "303 Residential Fri May 16 00:00:00 EDT 2008 168750 38.718460 
-121.370862 \n", + "221 Residential Tue May 20 00:00:00 EDT 2008 225000 38.624762 -121.522775 \n", + "4 Residential Mon May 19 00:00:00 EDT 2008 240000 38.634370 -121.510779 \n", + "203 Residential Tue May 20 00:00:00 EDT 2008 495000 38.398488 -121.473424 \n", + "273 Residential Tue May 20 00:00:00 EDT 2008 129000 38.666540 -121.378298 " + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# select random sample (small size)\n", + "\n", + "np.random.seed(123)\n", + "\n", + "small_sacramento = sacramento.sample(n=30)\n", + "small_sacramento" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot\n", + "plt.scatter(small_sacramento[\"sq__ft\"], small_sacramento['price'])\n", + "\n", + "# Add a vertical line at 2,000 square feet\n", + "plt.axvline(x=2000, color='red', linestyle='--', label='2000 sqft')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel(\"House size (square feet)\")\n", + "plt.ylabel('Price (USD)')\n", + "plt.title('Scatter Plot of House size vs Price')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "185 1196\n", + "210 214\n", + "487 649\n", + "267 175\n", + "674 685\n", + "177 218\n", + "217 750\n", + "589 848\n", + "696 720\n", + "602 413\n", + "229 625\n", + "456 90\n", + "526 888\n", + "531 789\n", + "252 680\n", + "227 53\n", + "567 824\n", + "162 885\n", + "240 731\n", + "75 1190\n", + "388 157\n", + "657 783\n", + "279 1099\n", + "601 497\n", + "788 555\n", + "303 331\n", + "221 478\n", + "4 1076\n", + "203 1070\n", + "273 1043\n", + "Name: dist, dtype: int64" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# abs difference between our house and the data point (observation)\n", + "\n", + "small_sacramento[\"dist\"] = (2000 - small_sacramento[\"sq__ft\"]).abs()\n", + "small_sacramento[\"dist\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
streetcityzipstatebedsbathssq__fttypesale_datepricelatitudelongitudedist
2273108 DELWOOD WAYSACRAMENTO95821CA422053ResidentialFri May 16 00:00:00 EDT 200845000038.621566-121.37088253
4565651 OVERLEAF WAYSACRAMENTO95835CA421910ResidentialTue May 20 00:00:00 EDT 200830050038.677454-121.49479190
3884901 MILLNER WAYELK GROVE95757CA321843ResidentialWed May 21 00:00:00 EDT 200825420038.386920-121.447349157
267361 MAHONIA CIRSACRAMENTO95835CA432175ResidentialMon May 19 00:00:00 EDT 200826100038.676172-121.509761175
210294 SPARROW DRGALT95632CA432214ResidentialFri May 16 00:00:00 EDT 200827800038.258976-121.321266214
\n", + "
" + ], + "text/plain": [ + " street city zip state beds baths sq__ft \\\n", + "227 3108 DELWOOD WAY SACRAMENTO 95821 CA 4 2 2053 \n", + "456 5651 OVERLEAF WAY SACRAMENTO 95835 CA 4 2 1910 \n", + "388 4901 MILLNER WAY ELK GROVE 95757 CA 3 2 1843 \n", + "267 361 MAHONIA CIR SACRAMENTO 95835 CA 4 3 2175 \n", + "210 294 SPARROW DR GALT 95632 CA 4 3 2214 \n", + "\n", + " type sale_date price latitude longitude \\\n", + "227 Residential Fri May 16 00:00:00 EDT 2008 450000 38.621566 -121.370882 \n", + "456 Residential Tue May 20 00:00:00 EDT 2008 300500 38.677454 -121.494791 \n", + "388 Residential Wed May 21 00:00:00 EDT 2008 254200 38.386920 -121.447349 \n", + "267 Residential Mon May 19 00:00:00 EDT 2008 261000 38.676172 -121.509761 \n", + "210 Residential Fri May 16 00:00:00 EDT 2008 278000 38.258976 -121.321266 \n", + "\n", + " dist \n", + "227 53 \n", + "456 90 \n", + "388 157 \n", + "267 175 \n", + "210 214 " + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nearest_neighbors = small_sacramento.nsmallest(5, \"dist\")\n", + "nearest_neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Scatter plot\n", + "plt.scatter(small_sacramento[\"sq__ft\"], small_sacramento['price'], label='All houses')\n", + "\n", + "# Plot nearest neighbors in orange\n", + "plt.scatter(nearest_neighbors[\"sq__ft\"], nearest_neighbors['price'], color='orange', label='Nearest neighbors', edgecolor='black')\n", + "\n", + "# Add a vertical line at 2,000 square feet\n", + "plt.axvline(x=2000, color='red', linestyle='--', label='2000 sqft')\n", + "\n", + "# Add labels, title, and legend\n", + "plt.xlabel(\"House size (square feet)\")\n", + "plt.ylabel('Price (USD)')\n", + "plt.title('Scatter Plot of House Size vs Price')\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "308740.0" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction = nearest_neighbors[\"price\"].mean()\n", + "prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "sacramento_train, sacramento_test = train_test_split(\n", + " sacramento, train_size=0.75, random_state=42\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = sacramento_train[[\"sq__ft\"]]\n", + "y_train = sacramento_train[\"price\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "knn_regressor = KNeighborsRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = {\n", + " \"n_neighbors\": range(1, 201, 3), # But wait...? 
What is this?\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "sacr_gridsearch = GridSearchCV(\n", + " estimator=knn_regressor,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring=\"neg_root_mean_squared_error\" # we can also use \"R2\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5, estimator=KNeighborsRegressor(),\n",
+       "             param_grid={'n_neighbors': range(1, 201, 3)},\n",
+       "             scoring='neg_root_mean_squared_error')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5, estimator=KNeighborsRegressor(),\n", + " param_grid={'n_neighbors': range(1, 201, 3)},\n", + " scoring='neg_root_mean_squared_error')" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sacr_gridsearch.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_n_neighborsparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoremean_test_scorestd_test_scorerank_test_score
00.0159500.0284510.0051700.0053171{'n_neighbors': 1}-112075.959498-113997.393596-117026.607659-120574.497484-98703.354492-112475.5625467462.62856967
10.0018630.0007500.0022980.0012534{'n_neighbors': 4}-87544.667287-83883.876555-83913.469660-104704.353254-85027.749120-89014.8231757957.00814256
20.0019030.0008030.0035730.0046617{'n_neighbors': 7}-86504.059436-82825.683348-76131.355307-102065.903265-79440.331044-85393.4664809022.45643035
30.0018450.0004270.0021030.00066210{'n_neighbors': 10}-84090.805474-82910.735403-78152.835102-102051.387567-74177.616156-84276.6759409563.81879727
40.0018090.0005170.0019030.00066413{'n_neighbors': 13}-84904.213103-81367.275879-79773.776612-100931.021005-75241.782711-84443.6138628808.99174529
.............................................
620.0019020.0008020.0047370.000988187{'n_neighbors': 187}-92650.337006-88889.235388-90004.888176-101845.246945-77821.033008-90242.1481057701.97431662
630.0015300.0004800.0040110.000558190{'n_neighbors': 190}-92895.085263-89245.862528-90289.489219-102084.469772-78056.396280-90514.2606127699.29614563
640.0015810.0004760.0043670.001271193{'n_neighbors': 193}-93007.820652-89594.971914-90591.944719-102189.040603-78243.200165-90725.3956107664.53508864
650.0021370.0004830.0050880.002020196{'n_neighbors': 196}-93187.805582-89786.909552-90765.041431-102241.600388-78394.469000-90875.1651907635.55315165
660.0018140.0005080.0040200.000840199{'n_neighbors': 199}-93500.570900-90248.379001-90744.950994-102504.487078-78564.416074-91112.5608107665.72188166
\n", + "

67 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.015950 0.028451 0.005170 0.005317 \n", + "1 0.001863 0.000750 0.002298 0.001253 \n", + "2 0.001903 0.000803 0.003573 0.004661 \n", + "3 0.001845 0.000427 0.002103 0.000662 \n", + "4 0.001809 0.000517 0.001903 0.000664 \n", + ".. ... ... ... ... \n", + "62 0.001902 0.000802 0.004737 0.000988 \n", + "63 0.001530 0.000480 0.004011 0.000558 \n", + "64 0.001581 0.000476 0.004367 0.001271 \n", + "65 0.002137 0.000483 0.005088 0.002020 \n", + "66 0.001814 0.000508 0.004020 0.000840 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 1 {'n_neighbors': 1} -112075.959498 \n", + "1 4 {'n_neighbors': 4} -87544.667287 \n", + "2 7 {'n_neighbors': 7} -86504.059436 \n", + "3 10 {'n_neighbors': 10} -84090.805474 \n", + "4 13 {'n_neighbors': 13} -84904.213103 \n", + ".. ... ... ... \n", + "62 187 {'n_neighbors': 187} -92650.337006 \n", + "63 190 {'n_neighbors': 190} -92895.085263 \n", + "64 193 {'n_neighbors': 193} -93007.820652 \n", + "65 196 {'n_neighbors': 196} -93187.805582 \n", + "66 199 {'n_neighbors': 199} -93500.570900 \n", + "\n", + " split1_test_score split2_test_score split3_test_score \\\n", + "0 -113997.393596 -117026.607659 -120574.497484 \n", + "1 -83883.876555 -83913.469660 -104704.353254 \n", + "2 -82825.683348 -76131.355307 -102065.903265 \n", + "3 -82910.735403 -78152.835102 -102051.387567 \n", + "4 -81367.275879 -79773.776612 -100931.021005 \n", + ".. ... ... ... 
\n", + "62 -88889.235388 -90004.888176 -101845.246945 \n", + "63 -89245.862528 -90289.489219 -102084.469772 \n", + "64 -89594.971914 -90591.944719 -102189.040603 \n", + "65 -89786.909552 -90765.041431 -102241.600388 \n", + "66 -90248.379001 -90744.950994 -102504.487078 \n", + "\n", + " split4_test_score mean_test_score std_test_score rank_test_score \n", + "0 -98703.354492 -112475.562546 7462.628569 67 \n", + "1 -85027.749120 -89014.823175 7957.008142 56 \n", + "2 -79440.331044 -85393.466480 9022.456430 35 \n", + "3 -74177.616156 -84276.675940 9563.818797 27 \n", + "4 -75241.782711 -84443.613862 8808.991745 29 \n", + ".. ... ... ... ... \n", + "62 -77821.033008 -90242.148105 7701.974316 62 \n", + "63 -78056.396280 -90514.260612 7699.296145 63 \n", + "64 -78243.200165 -90725.395610 7664.535088 64 \n", + "65 -78394.469000 -90875.165190 7635.553151 65 \n", + "66 -78564.416074 -91112.560810 7665.721881 66 \n", + "\n", + "[67 rows x 14 columns]" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = pd.DataFrame(sacr_gridsearch.cv_results_)\n", + "results # After fitting the model, we extract the cross-validation results using `cv_results_`. This output includes various metrics and parameters tested during the cross-validation process.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_n_neighborsparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoremean_test_scorestd_test_scorerank_test_score
00.0159500.0284510.0051700.0053171{'n_neighbors': 1}-112075.959498-113997.393596-117026.607659-120574.497484-98703.354492112475.5625467462.62856967
10.0018630.0007500.0022980.0012534{'n_neighbors': 4}-87544.667287-83883.876555-83913.469660-104704.353254-85027.74912089014.8231757957.00814256
20.0019030.0008030.0035730.0046617{'n_neighbors': 7}-86504.059436-82825.683348-76131.355307-102065.903265-79440.33104485393.4664809022.45643035
30.0018450.0004270.0021030.00066210{'n_neighbors': 10}-84090.805474-82910.735403-78152.835102-102051.387567-74177.61615684276.6759409563.81879727
40.0018090.0005170.0019030.00066413{'n_neighbors': 13}-84904.213103-81367.275879-79773.776612-100931.021005-75241.78271184443.6138628808.99174529
.............................................
620.0019020.0008020.0047370.000988187{'n_neighbors': 187}-92650.337006-88889.235388-90004.888176-101845.246945-77821.03300890242.1481057701.97431662
630.0015300.0004800.0040110.000558190{'n_neighbors': 190}-92895.085263-89245.862528-90289.489219-102084.469772-78056.39628090514.2606127699.29614563
640.0015810.0004760.0043670.001271193{'n_neighbors': 193}-93007.820652-89594.971914-90591.944719-102189.040603-78243.20016590725.3956107664.53508864
650.0021370.0004830.0050880.002020196{'n_neighbors': 196}-93187.805582-89786.909552-90765.041431-102241.600388-78394.46900090875.1651907635.55315165
660.0018140.0005080.0040200.000840199{'n_neighbors': 199}-93500.570900-90248.379001-90744.950994-102504.487078-78564.41607491112.5608107665.72188166
\n", + "

67 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.015950 0.028451 0.005170 0.005317 \n", + "1 0.001863 0.000750 0.002298 0.001253 \n", + "2 0.001903 0.000803 0.003573 0.004661 \n", + "3 0.001845 0.000427 0.002103 0.000662 \n", + "4 0.001809 0.000517 0.001903 0.000664 \n", + ".. ... ... ... ... \n", + "62 0.001902 0.000802 0.004737 0.000988 \n", + "63 0.001530 0.000480 0.004011 0.000558 \n", + "64 0.001581 0.000476 0.004367 0.001271 \n", + "65 0.002137 0.000483 0.005088 0.002020 \n", + "66 0.001814 0.000508 0.004020 0.000840 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 1 {'n_neighbors': 1} -112075.959498 \n", + "1 4 {'n_neighbors': 4} -87544.667287 \n", + "2 7 {'n_neighbors': 7} -86504.059436 \n", + "3 10 {'n_neighbors': 10} -84090.805474 \n", + "4 13 {'n_neighbors': 13} -84904.213103 \n", + ".. ... ... ... \n", + "62 187 {'n_neighbors': 187} -92650.337006 \n", + "63 190 {'n_neighbors': 190} -92895.085263 \n", + "64 193 {'n_neighbors': 193} -93007.820652 \n", + "65 196 {'n_neighbors': 196} -93187.805582 \n", + "66 199 {'n_neighbors': 199} -93500.570900 \n", + "\n", + " split1_test_score split2_test_score split3_test_score \\\n", + "0 -113997.393596 -117026.607659 -120574.497484 \n", + "1 -83883.876555 -83913.469660 -104704.353254 \n", + "2 -82825.683348 -76131.355307 -102065.903265 \n", + "3 -82910.735403 -78152.835102 -102051.387567 \n", + "4 -81367.275879 -79773.776612 -100931.021005 \n", + ".. ... ... ... 
\n", + "62 -88889.235388 -90004.888176 -101845.246945 \n", + "63 -89245.862528 -90289.489219 -102084.469772 \n", + "64 -89594.971914 -90591.944719 -102189.040603 \n", + "65 -89786.909552 -90765.041431 -102241.600388 \n", + "66 -90248.379001 -90744.950994 -102504.487078 \n", + "\n", + " split4_test_score mean_test_score std_test_score rank_test_score \n", + "0 -98703.354492 112475.562546 7462.628569 67 \n", + "1 -85027.749120 89014.823175 7957.008142 56 \n", + "2 -79440.331044 85393.466480 9022.456430 35 \n", + "3 -74177.616156 84276.675940 9563.818797 27 \n", + "4 -75241.782711 84443.613862 8808.991745 29 \n", + ".. ... ... ... ... \n", + "62 -77821.033008 90242.148105 7701.974316 62 \n", + "63 -78056.396280 90514.260612 7699.296145 63 \n", + "64 -78243.200165 90725.395610 7664.535088 64 \n", + "65 -78394.469000 90875.165190 7635.553151 65 \n", + "66 -78564.416074 91112.560810 7665.721881 66 \n", + "\n", + "[67 rows x 14 columns]" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# absolute the result\n", + "results[\"mean_test_score\"]=results[\"mean_test_score\"].abs()\n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create the plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot mean test scores with error bars\n", + "plt.plot(results['param_n_neighbors'], results['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Regression Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 25}" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sacr_gridsearch.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## break" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "sacramento_test[\"predicted\"] = sacr_gridsearch.predict(sacramento_test[[\"sq__ft\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "93573.27378694214" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmspe = mean_squared_error(\n", + " y_true=sacramento_test[\"price\"],\n", + " y_pred=sacramento_test[\"predicted\"]\n", + ")**0.5\n", + "\n", + "rmspe" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'r2_score' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[124], line 
1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m r2 \u001b[38;5;241m=\u001b[39m \u001b[43mr2_score\u001b[49m(\n\u001b[0;32m 2\u001b[0m y_true\u001b[38;5;241m=\u001b[39msacramento_test[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprice\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m 3\u001b[0m y_pred\u001b[38;5;241m=\u001b[39msacramento_test[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpredicted\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 4\u001b[0m )\n\u001b[0;32m 6\u001b[0m r2\n", + "\u001b[1;31mNameError\u001b[0m: name 'r2_score' is not defined" + ] + } + ], + "source": [ + "r2 = r2_score(\n", + " y_true=sacramento_test[\"price\"],\n", + " y_pred=sacramento_test[\"predicted\"]\n", + ")\n", + "\n", + "r2" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\tinti\\miniconda3\\envs\\dsi_participant\\lib\\site-packages\\sklearn\\base.py:493: UserWarning: X does not have valid feature names, but KNeighborsRegressor was fitted with feature names\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Generate a range of house sizes for prediction\n", + "sizes = np.linspace(sacramento[\"sq__ft\"].min(), sacramento[\"sq__ft\"].max(), 100).reshape(-1, 1)\n", + "\n", + "# Predict house prices for these sizes using the best model from GridSearchCV\n", + "predicted_prices = sacr_gridsearch.predict(sizes)\n", + "\n", + "# Plot the original data\n", + "plt.scatter(sacramento[\"sq__ft\"], sacramento[\"price\"], label=\"Actual Prices\")\n", + "\n", + "# Plot the model predictions as a line\n", + "plt.plot(sizes, predicted_prices, color='red', label=\"Model Predictions\")\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel(\"House size (square feet)\")\n", + "plt.ylabel(\"Price (USD)\")\n", + "plt.title(\"Scatter Plot of House Size vs Price with Model Predictions\")\n", + "plt.legend()\n", + "plt.show();" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/01_materials/notebooks/Classification-2.ipynb b/01_materials/notebooks/Classification-2.ipynb index 96db650b8..fa1a27830 100644 --- a/01_materials/notebooks/Classification-2.ipynb +++ b/01_materials/notebooks/Classification-2.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -424,7 +424,7 @@ "[569 rows x 32 columns]" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ 
-444,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -453,7 +453,7 @@ "array(['Malignant', 'Benign'], dtype=object)" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -864,7 +864,7 @@ "[569 rows x 32 columns]" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -978,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -986,7 +986,7 @@ "output_type": "stream", "text": [ "\n", - "Int64Index: 426 entries, 164 to 284\n", + "Index: 426 entries, 164 to 284\n", "Data columns (total 32 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", @@ -1033,18 +1033,19 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ + "diagnosis\n", "Benign 0.626761\n", "Malignant 0.373239\n", - "Name: diagnosis, dtype: float64" + "Name: proportion, dtype: float64" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1070,19 +1071,423 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "KNeighborsClassifier()" ] }, - "execution_count": 12, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1094,19 +1499,423 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "KNeighborsClassifier()" ] }, - "execution_count": 13, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1124,7 +1933,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1242,7 +2051,7 @@ "[143 rows x 3 columns]" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1277,7 +2086,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1286,7 +2095,7 @@ "0.9230769230769231" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1363,7 +2172,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1418,7 +2227,7 @@ "Malignant 9 44" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1488,7 +2297,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1497,7 +2306,7 @@ "0.9565217391304348" ] }, - "execution_count": 17, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1523,7 +2332,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1532,7 +2341,7 @@ "0.8301886792452831" ] }, - "execution_count": 18, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1592,7 +2401,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1601,7 +2410,7 @@ "0.8785046728971962" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1671,7 +2480,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1703,32 +2512,32 @@ " \n", " \n", " 0\n", - " 
0.002303\n", - " 0.004767\n", + " 0.011669\n", + " 0.005997\n", " 0.930233\n", " \n", " \n", " 1\n", - " 0.001559\n", - " 0.003668\n", + " 0.001999\n", + " 0.009697\n", " 0.894118\n", " \n", " \n", " 2\n", - " 0.001143\n", - " 0.002115\n", + " 0.001734\n", + " 0.000000\n", " 0.870588\n", " \n", " \n", " 3\n", - " 0.001001\n", - " 0.001845\n", + " 0.000000\n", + " 0.011973\n", " 0.952941\n", " \n", " \n", " 4\n", - " 0.000851\n", - " 0.001786\n", + " 0.002000\n", + " 0.004999\n", " 0.917647\n", " \n", " \n", @@ -1737,14 +2546,14 @@ ], "text/plain": [ " fit_time score_time test_score\n", - "0 0.002303 0.004767 0.930233\n", - "1 0.001559 0.003668 0.894118\n", - "2 0.001143 0.002115 0.870588\n", - "3 0.001001 0.001845 0.952941\n", - "4 0.000851 0.001786 0.917647" + "0 0.011669 0.005997 0.930233\n", + "1 0.001999 0.009697 0.894118\n", + "2 0.001734 0.000000 0.870588\n", + "3 0.000000 0.011973 0.952941\n", + "4 0.002000 0.004999 0.917647" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1768,7 +2577,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1800,14 +2609,14 @@ " \n", " \n", " mean\n", - " 0.001371\n", - " 0.002836\n", + " 0.003480\n", + " 0.006533\n", " 0.913105\n", " \n", " \n", " sem\n", - " 0.000261\n", - " 0.000593\n", + " 0.002081\n", + " 0.002061\n", " 0.014264\n", " \n", " \n", @@ -1816,11 +2625,11 @@ ], "text/plain": [ " fit_time score_time test_score\n", - "mean 0.001371 0.002836 0.913105\n", - "sem 0.000261 0.000593 0.014264" + "mean 0.003480 0.006533 0.913105\n", + "sem 0.002081 0.002061 0.014264" ] }, - "execution_count": 21, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1884,7 +2693,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1903,7 +2712,7 @@ }, { "cell_type": "code", - "execution_count": 23, + 
"execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1916,7 +2725,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1964,10 +2773,10 @@ " \n", " \n", " 0\n", - " 0.001236\n", - " 0.000532\n", - " 0.001921\n", - " 0.001362\n", + " 0.002365\n", + " 0.002850\n", + " 0.004009\n", + " 0.002898\n", " 1\n", " {'n_neighbors': 1}\n", " 0.953488\n", @@ -1986,10 +2795,10 @@ " \n", " \n", " 1\n", - " 0.000739\n", - " 0.000039\n", + " 0.002970\n", + " 0.004753\n", " 0.001109\n", - " 0.000077\n", + " 0.001382\n", " 6\n", " {'n_neighbors': 6}\n", " 0.930233\n", @@ -2008,10 +2817,10 @@ " \n", " \n", " 2\n", - " 0.000642\n", - " 0.000028\n", - " 0.000983\n", - " 0.000054\n", + " 0.003535\n", + " 0.005796\n", + " 0.002741\n", + " 0.003816\n", " 11\n", " {'n_neighbors': 11}\n", " 0.906977\n", @@ -2030,10 +2839,10 @@ " \n", " \n", " 3\n", - " 0.000591\n", - " 0.000042\n", - " 0.000902\n", - " 0.000028\n", + " 0.001804\n", + " 0.005148\n", + " 0.004924\n", + " 0.006816\n", " 16\n", " {'n_neighbors': 16}\n", " 0.906977\n", @@ -2052,10 +2861,10 @@ " \n", " \n", " 4\n", - " 0.000560\n", - " 0.000055\n", - " 0.000879\n", - " 0.000042\n", + " 0.001850\n", + " 0.003662\n", + " 0.004817\n", + " 0.006118\n", " 21\n", " {'n_neighbors': 21}\n", " 0.906977\n", @@ -2074,10 +2883,10 @@ " \n", " \n", " 5\n", - " 0.000539\n", - " 0.000011\n", - " 0.000888\n", - " 0.000066\n", + " 0.001878\n", + " 0.004291\n", + " 0.003432\n", + " 0.006023\n", " 26\n", " {'n_neighbors': 26}\n", " 0.906977\n", @@ -2096,10 +2905,10 @@ " \n", " \n", " 6\n", - " 0.000553\n", - " 0.000030\n", - " 0.000899\n", - " 0.000049\n", + " 0.002092\n", + " 0.004207\n", + " 0.002637\n", + " 0.004826\n", " 31\n", " {'n_neighbors': 31}\n", " 0.906977\n", @@ -2118,10 +2927,10 @@ " \n", " \n", " 7\n", - " 0.000532\n", - " 0.000011\n", - " 0.000890\n", - " 0.000015\n", + " 0.000296\n", + " 0.000625\n", + " 0.005729\n", + " 0.007880\n", " 36\n", " 
{'n_neighbors': 36}\n", " 0.906977\n", @@ -2140,10 +2949,10 @@ " \n", " \n", " 8\n", - " 0.000541\n", - " 0.000023\n", - " 0.000918\n", - " 0.000048\n", + " 0.002190\n", + " 0.004474\n", + " 0.003729\n", + " 0.005384\n", " 41\n", " {'n_neighbors': 41}\n", " 0.906977\n", @@ -2162,10 +2971,10 @@ " \n", " \n", " 9\n", - " 0.000551\n", - " 0.000041\n", - " 0.000936\n", - " 0.000028\n", + " 0.003755\n", + " 0.005865\n", + " 0.001121\n", + " 0.002985\n", " 46\n", " {'n_neighbors': 46}\n", " 0.906977\n", @@ -2184,10 +2993,10 @@ " \n", " \n", " 10\n", - " 0.000554\n", - " 0.000036\n", - " 0.000965\n", - " 0.000057\n", + " 0.001470\n", + " 0.003812\n", + " 0.004952\n", + " 0.007163\n", " 51\n", " {'n_neighbors': 51}\n", " 0.906977\n", @@ -2206,10 +3015,10 @@ " \n", " \n", " 11\n", - " 0.000553\n", - " 0.000047\n", - " 0.000977\n", - " 0.000046\n", + " 0.001652\n", + " 0.003419\n", + " 0.005267\n", + " 0.008048\n", " 56\n", " {'n_neighbors': 56}\n", " 0.906977\n", @@ -2228,10 +3037,10 @@ " \n", " \n", " 12\n", - " 0.000552\n", - " 0.000017\n", - " 0.001015\n", - " 0.000091\n", + " 0.001976\n", + " 0.003919\n", + " 0.003366\n", + " 0.006761\n", " 61\n", " {'n_neighbors': 61}\n", " 0.906977\n", @@ -2250,10 +3059,10 @@ " \n", " \n", " 13\n", - " 0.000573\n", - " 0.000072\n", - " 0.001017\n", - " 0.000080\n", + " 0.004449\n", + " 0.006554\n", + " 0.002174\n", + " 0.004767\n", " 66\n", " {'n_neighbors': 66}\n", " 0.930233\n", @@ -2272,10 +3081,10 @@ " \n", " \n", " 14\n", - " 0.000530\n", - " 0.000008\n", - " 0.001025\n", - " 0.000093\n", + " 0.003282\n", + " 0.005389\n", + " 0.002785\n", + " 0.005229\n", " 71\n", " {'n_neighbors': 71}\n", " 0.930233\n", @@ -2294,10 +3103,10 @@ " \n", " \n", " 15\n", - " 0.000552\n", - " 0.000045\n", - " 0.001024\n", - " 0.000052\n", + " 0.001471\n", + " 0.004413\n", + " 0.003773\n", + " 0.006788\n", " 76\n", " {'n_neighbors': 76}\n", " 0.930233\n", @@ -2316,10 +3125,10 @@ " \n", " \n", " 16\n", - " 0.000528\n", - " 0.000015\n", - " 0.001038\n", 
- " 0.000071\n", + " 0.003095\n", + " 0.006271\n", + " 0.003519\n", + " 0.005176\n", " 81\n", " {'n_neighbors': 81}\n", " 0.930233\n", @@ -2338,10 +3147,10 @@ " \n", " \n", " 17\n", - " 0.000538\n", - " 0.000022\n", - " 0.001039\n", - " 0.000036\n", + " 0.000263\n", + " 0.000789\n", + " 0.005718\n", + " 0.006187\n", " 86\n", " {'n_neighbors': 86}\n", " 0.906977\n", @@ -2360,10 +3169,10 @@ " \n", " \n", " 18\n", - " 0.000535\n", - " 0.000017\n", - " 0.001055\n", - " 0.000070\n", + " 0.004861\n", + " 0.007433\n", + " 0.000088\n", + " 0.000264\n", " 91\n", " {'n_neighbors': 91}\n", " 0.906977\n", @@ -2382,10 +3191,10 @@ " \n", " \n", " 19\n", - " 0.000548\n", - " 0.000048\n", - " 0.001064\n", - " 0.000042\n", + " 0.001921\n", + " 0.004337\n", + " 0.004902\n", + " 0.007439\n", " 96\n", " {'n_neighbors': 96}\n", " 0.906977\n", @@ -2408,48 +3217,48 @@ ], "text/plain": [ " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", - "0 0.001236 0.000532 0.001921 0.001362 \n", - "1 0.000739 0.000039 0.001109 0.000077 \n", - "2 0.000642 0.000028 0.000983 0.000054 \n", - "3 0.000591 0.000042 0.000902 0.000028 \n", - "4 0.000560 0.000055 0.000879 0.000042 \n", - "5 0.000539 0.000011 0.000888 0.000066 \n", - "6 0.000553 0.000030 0.000899 0.000049 \n", - "7 0.000532 0.000011 0.000890 0.000015 \n", - "8 0.000541 0.000023 0.000918 0.000048 \n", - "9 0.000551 0.000041 0.000936 0.000028 \n", - "10 0.000554 0.000036 0.000965 0.000057 \n", - "11 0.000553 0.000047 0.000977 0.000046 \n", - "12 0.000552 0.000017 0.001015 0.000091 \n", - "13 0.000573 0.000072 0.001017 0.000080 \n", - "14 0.000530 0.000008 0.001025 0.000093 \n", - "15 0.000552 0.000045 0.001024 0.000052 \n", - "16 0.000528 0.000015 0.001038 0.000071 \n", - "17 0.000538 0.000022 0.001039 0.000036 \n", - "18 0.000535 0.000017 0.001055 0.000070 \n", - "19 0.000548 0.000048 0.001064 0.000042 \n", - "\n", - " param_n_neighbors params split0_test_score \\\n", - "0 1 {'n_neighbors': 1} 0.953488 \n", - "1 6 {'n_neighbors': 
6} 0.930233 \n", - "2 11 {'n_neighbors': 11} 0.906977 \n", - "3 16 {'n_neighbors': 16} 0.906977 \n", - "4 21 {'n_neighbors': 21} 0.906977 \n", - "5 26 {'n_neighbors': 26} 0.906977 \n", - "6 31 {'n_neighbors': 31} 0.906977 \n", - "7 36 {'n_neighbors': 36} 0.906977 \n", - "8 41 {'n_neighbors': 41} 0.906977 \n", - "9 46 {'n_neighbors': 46} 0.906977 \n", - "10 51 {'n_neighbors': 51} 0.906977 \n", - "11 56 {'n_neighbors': 56} 0.906977 \n", - "12 61 {'n_neighbors': 61} 0.906977 \n", - "13 66 {'n_neighbors': 66} 0.930233 \n", - "14 71 {'n_neighbors': 71} 0.930233 \n", - "15 76 {'n_neighbors': 76} 0.930233 \n", - "16 81 {'n_neighbors': 81} 0.930233 \n", - "17 86 {'n_neighbors': 86} 0.906977 \n", - "18 91 {'n_neighbors': 91} 0.906977 \n", - "19 96 {'n_neighbors': 96} 0.906977 \n", + "0 0.002365 0.002850 0.004009 0.002898 \n", + "1 0.002970 0.004753 0.001109 0.001382 \n", + "2 0.003535 0.005796 0.002741 0.003816 \n", + "3 0.001804 0.005148 0.004924 0.006816 \n", + "4 0.001850 0.003662 0.004817 0.006118 \n", + "5 0.001878 0.004291 0.003432 0.006023 \n", + "6 0.002092 0.004207 0.002637 0.004826 \n", + "7 0.000296 0.000625 0.005729 0.007880 \n", + "8 0.002190 0.004474 0.003729 0.005384 \n", + "9 0.003755 0.005865 0.001121 0.002985 \n", + "10 0.001470 0.003812 0.004952 0.007163 \n", + "11 0.001652 0.003419 0.005267 0.008048 \n", + "12 0.001976 0.003919 0.003366 0.006761 \n", + "13 0.004449 0.006554 0.002174 0.004767 \n", + "14 0.003282 0.005389 0.002785 0.005229 \n", + "15 0.001471 0.004413 0.003773 0.006788 \n", + "16 0.003095 0.006271 0.003519 0.005176 \n", + "17 0.000263 0.000789 0.005718 0.006187 \n", + "18 0.004861 0.007433 0.000088 0.000264 \n", + "19 0.001921 0.004337 0.004902 0.007439 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 1 {'n_neighbors': 1} 0.953488 \n", + "1 6 {'n_neighbors': 6} 0.930233 \n", + "2 11 {'n_neighbors': 11} 0.906977 \n", + "3 16 {'n_neighbors': 16} 0.906977 \n", + "4 21 {'n_neighbors': 21} 0.906977 \n", + "5 26 
{'n_neighbors': 26} 0.906977 \n", + "6 31 {'n_neighbors': 31} 0.906977 \n", + "7 36 {'n_neighbors': 36} 0.906977 \n", + "8 41 {'n_neighbors': 41} 0.906977 \n", + "9 46 {'n_neighbors': 46} 0.906977 \n", + "10 51 {'n_neighbors': 51} 0.906977 \n", + "11 56 {'n_neighbors': 56} 0.906977 \n", + "12 61 {'n_neighbors': 61} 0.906977 \n", + "13 66 {'n_neighbors': 66} 0.930233 \n", + "14 71 {'n_neighbors': 71} 0.930233 \n", + "15 76 {'n_neighbors': 76} 0.930233 \n", + "16 81 {'n_neighbors': 81} 0.930233 \n", + "17 86 {'n_neighbors': 86} 0.906977 \n", + "18 91 {'n_neighbors': 91} 0.906977 \n", + "19 96 {'n_neighbors': 96} 0.906977 \n", "\n", " split1_test_score split2_test_score split3_test_score \\\n", "0 0.837209 0.906977 0.860465 \n", @@ -2540,7 +3349,7 @@ "19 0.047625 14 " ] }, - "execution_count": 24, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2601,12 +3410,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -2789,7 +3598,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -2803,7 +3612,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 73d92a3ee..5e3be7065 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# count number of rows\n", + "wine_df.shape[0]" ] }, { @@ -109,12 +399,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# count number of columns\n", + "wine_df.shape[1]" ] }, { @@ -127,12 +429,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int32')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# data type of column class\n", + "wine_df.dtypes['class']" ] }, { @@ -146,12 +460,329 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null 
float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 class 178 non-null int32 \n", + "dtypes: float64(13), int32(1)\n", + "memory usage: 18.9 KB\n" + ] + } + ], "source": [ - "# Your answer here" + "# Number of predictor variables is 13 (14 columns minus the 'class' target)\n", + "wine_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "156cc83a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df" ] }, { @@ -175,10 +806,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +862,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> To make sure all the predictor variables have the same scale, so that none of them dominates because of its larger scale and skews the classification result when using machine learning models that rely on distance metrics."
] }, { @@ -220,7 +878,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> This is the variable we want to determine through the model, and its scale does not affect the classification result." ] }, { @@ -236,7 +894,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> Setting a random seed is important because it allows us to control the randomness in our code. Therefore, we can reproduce the same result each time the code is run, which makes comparison and testing possible." ] }, { @@ -251,19 +909,33 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "72c101f2", + "execution_count": 9, + "id": "8e4a9dda", "metadata": {}, "outputs": [], "source": [ "# set a seed for reproducibility\n", "np.random.seed(123)\n", - "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", - "\n", "# Your code here ..." ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "af2f9ce3", + "metadata": {}, + "outputs": [], + "source": [ + "# split the data into training and testing set\n", + "\n", + "predictor_S_train, predictor_S_test, label_c_train, label_c_test= train_test_split(\n", + " predictors_standardized, wine_df['class'], train_size=0.75, shuffle= True,\n", + " stratify=wine_df[\"class\"], \n", + " random_state= 123\n", + ")" + ] + }, { "cell_type": "markdown", "id": "4604ee03", @@ -282,14 +954,1125 @@ "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results." ] }, + { + "cell_type": "markdown", + "id": "905ed370", + "metadata": {}, + "source": [ + "Question 3 - point 1" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "08818c64", "metadata": {}, "outputs": [], "source": [ - "# Your code here..."
+ "# initiate KNN\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "42b204f2", + "metadata": {}, + "outputs": [], + "source": [ + "# define x and y for KNN\n", + "X_train = predictor_S_train\n", + "y_train = label_c_train" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "44a9ab17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fitting KNN\n", + "knn.fit(X_train,y_train)" + ] + }, + { + "cell_type": "markdown", + "id": "9ffb8bf8", + "metadata": {}, + "source": [ + "Question 3 point 2" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "58c21754", + "metadata": {}, + "outputs": [], + "source": [ + "# implementing a gridSearch , define pararmeter grid, riging from 1 to 50\n", + "parameter_grid = {\n", + " \"n_neighbors\": range(1, 50, 3),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "ddf8185b", + "metadata": {}, + "source": [ + "Question 3 point 3 " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b3ad80ad", + "metadata": {}, + "outputs": [], + "source": [ + "# use function to search best K -- implementing a gridSearch \n", + "wine_tune_grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "672c7471", + "metadata": {}, + "source": [ + "Question 3 - point 4" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9fcf66a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n",
+       "             param_grid={'n_neighbors': range(1, 50, 3)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n", + " param_grid={'n_neighbors': range(1, 50, 3)})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fitting the x and y into the tune function\n", + "wine_tune_grid.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e0cca0de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paramsmean_test_score
0{'n_neighbors': 1}0.954396
1{'n_neighbors': 4}0.954945
2{'n_neighbors': 7}0.977473
3{'n_neighbors': 10}0.954396
4{'n_neighbors': 13}0.977473
5{'n_neighbors': 16}0.962637
6{'n_neighbors': 19}0.962637
7{'n_neighbors': 22}0.970330
8{'n_neighbors': 25}0.954945
9{'n_neighbors': 28}0.962637
10{'n_neighbors': 31}0.955495
11{'n_neighbors': 34}0.963187
12{'n_neighbors': 37}0.962637
13{'n_neighbors': 40}0.954945
14{'n_neighbors': 43}0.954945
15{'n_neighbors': 46}0.947253
16{'n_neighbors': 49}0.947253
\n", + "
" + ], + "text/plain": [ + " params mean_test_score\n", + "0 {'n_neighbors': 1} 0.954396\n", + "1 {'n_neighbors': 4} 0.954945\n", + "2 {'n_neighbors': 7} 0.977473\n", + "3 {'n_neighbors': 10} 0.954396\n", + "4 {'n_neighbors': 13} 0.977473\n", + "5 {'n_neighbors': 16} 0.962637\n", + "6 {'n_neighbors': 19} 0.962637\n", + "7 {'n_neighbors': 22} 0.970330\n", + "8 {'n_neighbors': 25} 0.954945\n", + "9 {'n_neighbors': 28} 0.962637\n", + "10 {'n_neighbors': 31} 0.955495\n", + "11 {'n_neighbors': 34} 0.963187\n", + "12 {'n_neighbors': 37} 0.962637\n", + "13 {'n_neighbors': 40} 0.954945\n", + "14 {'n_neighbors': 43} 0.954945\n", + "15 {'n_neighbors': 46} 0.947253\n", + "16 {'n_neighbors': 49} 0.947253" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check out the accuracy and showing just n_neighbour and test score\n", + "accuracies_grid = pd.DataFrame(wine_tune_grid.cv_results_)\n", + "#accuracies_grid\n", + "accuracies_grid [[\"params\",\"mean_test_score\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "369cdf3b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 7}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# optimal number of neighbours\n", + "wine_tune_grid.best_params_" ] }, { @@ -305,12 +2088,821 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your code here..." + "# Create the plot- to show the best K value\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "# Plot mean test scores with error bars\n", + "plt.plot(accuracies_grid['param_n_neighbors'], accuracies_grid['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a698cfd3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=7)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reuse the estimator that GridSearchCV already refit with the best 'n_neighbors'\n", + "# (best_estimator_ exists with the default refit=True). It was fitted on the data\n", + "# passed to wine_tune_grid.fit -- assumed to be the training split; TODO confirm.\n", + "# Never fit on X_test/y_test: the model would memorise the labels it is scored on,\n", + "# so the reported test accuracy would no longer measure generalisation.\n", + "knn = wine_tune_grid.best_estimator_\n", + "\n", + "# Held-out predictors (X) and labels (y), used only for prediction and scoring\n", + "X_test = predictor_S_test\n", + "y_test = label_c_test\n", + "\n", + "# Show the fitted model\n", + "knn" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f6bad3b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classtest_prediction
10211
8411
9611
6511
7911
1700
10911
11311
2800
15922
3800
3400
12511
11511
7111
7611
13122
3300
6011
1900
11411
4700
4800
15822
13322
13722
15422
13622
200
16822
11711
3200
2200
10811
7310
7711
14222
900
8511
5800
4500
17522
4200
14322
17722
\n", + "
" + ], + "text/plain": [ + " class test_prediction\n", + "102 1 1\n", + "84 1 1\n", + "96 1 1\n", + "65 1 1\n", + "79 1 1\n", + "17 0 0\n", + "109 1 1\n", + "113 1 1\n", + "28 0 0\n", + "159 2 2\n", + "38 0 0\n", + "34 0 0\n", + "125 1 1\n", + "115 1 1\n", + "71 1 1\n", + "76 1 1\n", + "131 2 2\n", + "33 0 0\n", + "60 1 1\n", + "19 0 0\n", + "114 1 1\n", + "47 0 0\n", + "48 0 0\n", + "158 2 2\n", + "133 2 2\n", + "137 2 2\n", + "154 2 2\n", + "136 2 2\n", + "2 0 0\n", + "168 2 2\n", + "117 1 1\n", + "32 0 0\n", + "22 0 0\n", + "108 1 1\n", + "73 1 0\n", + "77 1 1\n", + "142 2 2\n", + "9 0 0\n", + "85 1 1\n", + "58 0 0\n", + "45 0 0\n", + "175 2 2\n", + "42 0 0\n", + "143 2 2\n", + "177 2 2" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# incorporate the test prediction into the test data set and compare\n", + "\n", + "# concatnate the test data into one dataframe \n", + "full_test_data = pd.concat([X_test,y_test], axis=1)\n", + "full_test_data\n", + "\n", + "# Using knn predict to predict result and show it in one df\n", + "full_test_data[\"test_prediction\"] = knn.predict(X_test)\n", + "full_test_data[[\"class\",\"test_prediction\"]]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "fbda1bd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9777777777777777" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking the prediction accuracy using Knn score method for accuracy\n", + "knn.score(X_test,y_test)" ] }, { @@ -365,7 +2957,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -379,12 +2971,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": 
"497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.9.15" } }, "nbformat": 4, diff --git a/02_activities/assignments/assignment_1_cohort 4.ipynb b/02_activities/assignments/assignment_1_cohort 4.ipynb new file mode 100644 index 000000000..0e2091c8c --- /dev/null +++ b/02_activities/assignments/assignment_1_cohort 4.ipynb @@ -0,0 +1,455 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58", + "metadata": {}, + "source": [ + "# Assignment 1" + ] + }, + { + "cell_type": "markdown", + "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32", + "metadata": {}, + "source": [ + "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n", + "\n", + "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the help `help()` function in Python, to get information about the datasets and function in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n", + "\n", + "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue." + ] + }, + { + "cell_type": "markdown", + "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629", + "metadata": {}, + "source": [ + "### Classification using KNN\n", + "\n", + "Let's set up our workspace and use the **Wine dataset** from `scikit-learn`. 
This dataset contains 178 wine samples with 13 chemical features, used to classify wines into different classes based on their origin.\n", + "\n", + "The **response variable** is `class`, which indicates the type of wine. We'll use all of the chemical features to predict this response variable." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4a3485d6-ba58-4660-a983-5680821c5719", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pandas'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Import standard libraries\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrandom\u001b[39;00m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pandas'" + ] + } + ], + "source": [ + "# Import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import 
accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'sklearn'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_wine\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Load the Wine dataset\u001b[39;00m\n\u001b[0;32m 4\u001b[0m wine_data \u001b[38;5;241m=\u001b[39m load_wine()\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'" + ] + } + ], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "# Load the Wine dataset\n", + "wine_data = load_wine()\n", + "\n", + "# Convert to DataFrame\n", + "wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n", + "\n", + "# Bind the 'class' (wine target) to the DataFrame\n", + "wine_df['class'] = wine_data.target\n", + "\n", + "# Display the DataFrame\n", + "wine_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "721b2b17", + "metadata": {}, + "source": [ + "#### **Question 1:** \n", + "#### Data inspection\n", + "\n", + "Before fitting any model, it is essential to understand our data. **Use Python code** to answer the following questions about the **Wine dataset**:\n", + "\n", + "_(i)_ How many observations (rows) does the dataset contain?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56916892", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "f7573b59", + "metadata": {}, + "source": [ + "_(ii)_ How many variables (columns) does the dataset contain?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df0ef103", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "cb5180c7", + "metadata": {}, + "source": [ + "_(iii)_ What is the 'variable type' of the response variable `class` (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47989426", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "a25f5e1b", + "metadata": {}, + "source": [ + "\n", + "_(iv)_ How many predictor variables do we have (Hint: all variables other than `class`)? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd7b0910", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "d631e8e3", + "metadata": {}, + "source": [ + "You can use `print()` and `describe()` to help answer these questions." + ] + }, + { + "cell_type": "markdown", + "id": "fa3832d7", + "metadata": {}, + "source": [ + "#### **Question 2:** \n", + "#### Standardization and data-splitting\n", + "\n", + "Next, we must perform 'pre-processing' or 'data munging', to prepare our data for classification/prediction. For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables.
We can achieve this using the scaler method, provided as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cc899b59", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'wine_df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[3], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Select predictors (excluding the last column)\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m predictors \u001b[38;5;241m=\u001b[39m \u001b[43mwine_df\u001b[49m\u001b[38;5;241m.\u001b[39miloc[:, :\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Standardize the predictors\u001b[39;00m\n\u001b[0;32m 5\u001b[0m scaler \u001b[38;5;241m=\u001b[39m StandardScaler()\n", + "\u001b[1;31mNameError\u001b[0m: name 'wine_df' is not defined" + ] + } + ], + "source": [ + "# Select predictors (excluding the last column)\n", + "predictors = wine_df.iloc[:, :-1]\n", + "\n", + "# Standardize the predictors\n", + "scaler = StandardScaler()\n", + "predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns)\n", + "\n", + "# Display the head of the standardized predictors\n", + "print(predictors_standardized.head())" + ] + }, + { + "cell_type": "markdown", + "id": "9981ca48", + "metadata": {}, + "source": [ + "(i) Why is it important to standardize the predictor variables?" + ] + }, + { + "cell_type": "markdown", + "id": "403ef0bb", + "metadata": {}, + "source": [ + "> Your answer here..." + ] + }, + { + "cell_type": "markdown", + "id": "8e2e1bea", + "metadata": {}, + "source": [ + "(ii) Why did we elect not to standard our response variable `Class`?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "fdee5a15", + "metadata": {}, + "source": [ + "> Your answer here..." + ] + }, + { + "cell_type": "markdown", + "id": "8077ec21", + "metadata": {}, + "source": [ + "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?" + ] + }, + { + "cell_type": "markdown", + "id": "f0676c21", + "metadata": {}, + "source": [ + "Your answer here..." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df9de570", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'np' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[4], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mnp\u001b[49m\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;241m100\u001b[39m)\n", + "\u001b[1;31mNameError\u001b[0m: name 'np' is not defined" + ] + } + ], + "source": [ + "np.random.seed(100)" + ] + }, + { + "cell_type": "markdown", + "id": "36ab9229", + "metadata": {}, + "source": [ + "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables. \n", + "\n", + "Extend the code to create a non-overlapping test set for the predictors and response variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72c101f2", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'np' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Do not touch\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mnp\u001b[49m\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;241m123\u001b[39m)\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Create a random vector of True and False values to split the data\u001b[39;00m\n\u001b[0;32m 4\u001b[0m split \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mchoice([\u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;28;01mFalse\u001b[39;00m], size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(predictors_standardized), replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, p\u001b[38;5;241m=\u001b[39m[\u001b[38;5;241m0.75\u001b[39m, \u001b[38;5;241m0.25\u001b[39m])\n", + "\u001b[1;31mNameError\u001b[0m: name 'np' is not defined" + ] + } + ], + "source": [ + "# Do not touch\n", + "np.random.seed(123)\n", + "# Create a random vector of True and False values to split the data\n", + "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])" + ] + }, + { + "cell_type": "markdown", + "id": "4604ee03", + "metadata": {}, + "source": [ + "#### **Question 3:**\n", + "#### Model initialization and cross-validation\n", + "We are finally set to fit the KNN model. \n", + "\n", + "\n", + "Perform a grid search to tune the `n_neighbors` hyperparameter using 10-fold cross-validation. Follow these steps:\n", + "\n", + "1. 
Initialize the KNN classifier using `KNeighborsClassifier()`.\n", + "2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n", + "3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n", + "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08818c64", + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here..." + ] + }, + { + "cell_type": "markdown", + "id": "3f76bf62", + "metadata": {}, + "source": [ + "#### **Question 4:**\n", + "#### Model evaluation\n", + "\n", + "Using the best value for `n_neighbors`, fit a KNN model on the training data and evaluate its performance on the test set using `accuracy_score`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffefa9f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here..." + ] + }, + { + "cell_type": "markdown", + "id": "6f8a69db", + "metadata": {}, + "source": [ + "# Criteria\n", + "\n", + "\n", + "| **Criteria** | **Complete** | **Incomplete** |\n", + "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n", + "| **Data Inspection** | Data is inspected for number of variables, observations and data types. | Data inspection is missing or incomplete. |\n", + "| **Data Scaling** | Data scaling or normalization is applied where necessary (e.g., using `StandardScaler`). | Data scaling or normalization is missing or incorrectly applied. |\n", + "| **Model Initialization** | The KNN model is correctly initialized and a random seed is set for reproducibility. | The KNN model is not initialized, is incorrect, or lacks a random seed for reproducibility. 
|\n", + "| **Parameter Grid for `n_neighbors`** | The parameter grid for `n_neighbors` is correctly defined. | The parameter grid is missing or incorrectly defined. |\n", + "| **Cross-Validation Setup** | Cross-validation is set up correctly with 10 folds. | Cross-validation is missing or incorrectly set up. |\n", + "| **Best Hyperparameter (`n_neighbors`) Selection** | The best value for `n_neighbors` is identified using the grid search results. | The best `n_neighbors` is not selected or incorrect. |\n", + "| **Model Evaluation on Test Data** | The model is evaluated on the test data using accuracy. | The model evaluation is missing or uses the wrong metric. |\n" + ] + }, + { + "cell_type": "markdown", + "id": "0b4390cc", + "metadata": {}, + "source": [ + "## Submission Information\n", + "\n", + "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. Following these guidelines is crucial for your submissions to be evaluated correctly.\n", + "\n", + "### Note:\n", + "\n", + "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked with in your pull request by tagging their GitHub username. Separate submissions are required.\n", + "\n", + "### Submission Parameters:\n", + "* Submission Due Date: `HH:MM AM/PM - DD/MM/YYYY`\n", + "* The branch name for your repo should be: `assignment-1`\n", + "* What to submit for this assignment:\n", + " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n", + "* What the pull request link should look like for this assignment: `https://github.com//applying_statistical_concepts/pull/`\n", + " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. 
Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n", + "\n", + "Checklist:\n", + "- [ ] Created a branch with the correct naming convention.\n", + "- [ ] Ensured that the repository is public.\n", + "- [ ] Reviewed the PR description guidelines and adhered to them.\n", + "- [ ] Verify that the link is accessible in a private browser window.\n", + "\n", + "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/02_activities/assignments/assignment_1_ori.ipynb b/02_activities/assignments/assignment_1_ori.ipynb new file mode 100644 index 000000000..094ad69bc --- /dev/null +++ b/02_activities/assignments/assignment_1_ori.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58", + "metadata": {}, + "source": [ + "# Assignment 1" + ] + }, + { + "cell_type": "markdown", + "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32", + "metadata": {}, + "source": [ + "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. 
For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n", + "\n", + "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the help `help()` function in Python, to get information about the datasets and function in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n", + "\n", + "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue." + ] + }, + { + "cell_type": "markdown", + "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629", + "metadata": {}, + "source": [ + "### Classification using KNN\n", + "\n", + "Let's set up our workspace and use the **Wine dataset** from `scikit-learn`. This dataset contains 178 wine samples with 13 chemical features, used to classify wines into different classes based on their origin.\n", + "\n", + "The **response variable** is `class`, which indicates the type of wine. We'll use all of the chemical features to predict this response variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a3485d6-ba58-4660-a983-5680821c5719", + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "# Load the Wine dataset\n", + "wine_data = load_wine()\n", + "\n", + "# Convert to DataFrame\n", + "wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n", + "\n", + "# Bind the 'class' (wine target) to the DataFrame\n", + "wine_df['class'] = wine_data.target\n", + "\n", + "# Display the DataFrame\n", + "wine_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "721b2b17", + "metadata": {}, + "source": [ + "#### **Question 1:** \n", + "#### Data inspection\n", + "\n", + "Before fitting any model, it is essential to understand our data. **Use Python code** to answer the following questions about the **Wine dataset**:\n", + "\n", + "_(i)_ How many observations (rows) does the dataset contain?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56916892", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "f7573b59", + "metadata": {}, + "source": [ + "_(ii)_ How many variables (columns) does the dataset contain?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df0ef103", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "cb5180c7", + "metadata": {}, + "source": [ + "_(iii)_ What is the 'variable type' of the response variable `class` (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47989426", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "a25f5e1b", + "metadata": {}, + "source": [ + "\n", + "_(iv)_ How many predictor variables do we have (Hint: all variables other than `class`)? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd7b0910", + "metadata": {}, + "outputs": [], + "source": [ + "# Your answer here" + ] + }, + { + "cell_type": "markdown", + "id": "d631e8e3", + "metadata": {}, + "source": [ + "You can use `print()` and `describe()` to help answer these questions." + ] + }, + { + "cell_type": "markdown", + "id": "fa3832d7", + "metadata": {}, + "source": [ + "#### **Question 2:** \n", + "#### Standardization and data-splitting\n", + "\n", + "Next, we must perform 'pre-processing' or 'data munging', to prepare our data for classification/prediction. For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables.
We can achieve this using the scaler method, provided as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc899b59", + "metadata": {}, + "outputs": [], + "source": [ + "# Select predictors (excluding the last column)\n", + "predictors = wine_df.iloc[:, :-1]\n", + "\n", + "# Standardize the predictors\n", + "scaler = StandardScaler()\n", + "predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns)\n", + "\n", + "# Display the head of the standardized predictors\n", + "print(predictors_standardized.head())" + ] + }, + { + "cell_type": "markdown", + "id": "9981ca48", + "metadata": {}, + "source": [ + "(i) Why is it important to standardize the predictor variables?" + ] + }, + { + "cell_type": "markdown", + "id": "403ef0bb", + "metadata": {}, + "source": [ + "> Your answer here..." + ] + }, + { + "cell_type": "markdown", + "id": "8e2e1bea", + "metadata": {}, + "source": [ + "(ii) Why did we elect not to standardize our response variable `class`?" + ] + }, + { + "cell_type": "markdown", + "id": "fdee5a15", + "metadata": {}, + "source": [ + "> Your answer here..." + ] + }, + { + "cell_type": "markdown", + "id": "8077ec21", + "metadata": {}, + "source": [ + "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?" + ] + }, + { + "cell_type": "markdown", + "id": "f0676c21", + "metadata": {}, + "source": [ + "> Your answer here..." + ] + }, + { + "cell_type": "markdown", + "id": "36ab9229", + "metadata": {}, + "source": [ + "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables.
\n", + "\n", + "Extend the code to create a non-overlapping test set for the predictors and response variables." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72c101f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Do not touch\n", + "np.random.seed(123)\n", + "# Create a random vector of True and False values to split the data\n", + "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])" + ] + }, + { + "cell_type": "markdown", + "id": "4604ee03", + "metadata": {}, + "source": [ + "#### **Question 3:**\n", + "#### Model initialization and cross-validation\n", + "We are finally set to fit the KNN model. \n", + "\n", + "\n", + "Perform a grid search to tune the `n_neighbors` hyperparameter using 10-fold cross-validation. Follow these steps:\n", + "\n", + "1. Initialize the KNN classifier using `KNeighborsClassifier()`.\n", + "2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n", + "3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n", + "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08818c64", + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here..." + ] + }, + { + "cell_type": "markdown", + "id": "3f76bf62", + "metadata": {}, + "source": [ + "#### **Question 4:**\n", + "#### Model evaluation\n", + "\n", + "Using the best value for `n_neighbors`, fit a KNN model on the training data and evaluate its performance on the test set using `accuracy_score`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffefa9f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here..." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6f8a69db", + "metadata": {}, + "source": [ + "# Criteria\n", + "\n", + "\n", + "| **Criteria** | **Complete** | **Incomplete** |\n", + "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n", + "| **Data Inspection** | Data is inspected for number of variables, observations and data types. | Data inspection is missing or incomplete. |\n", + "| **Data Scaling** | Data scaling or normalization is applied where necessary (e.g., using `StandardScaler`). | Data scaling or normalization is missing or incorrectly applied. |\n", + "| **Model Initialization** | The KNN model is correctly initialized and a random seed is set for reproducibility. | The KNN model is not initialized, is incorrect, or lacks a random seed for reproducibility. |\n", + "| **Parameter Grid for `n_neighbors`** | The parameter grid for `n_neighbors` is correctly defined. | The parameter grid is missing or incorrectly defined. |\n", + "| **Cross-Validation Setup** | Cross-validation is set up correctly with 10 folds. | Cross-validation is missing or incorrectly set up. |\n", + "| **Best Hyperparameter (`n_neighbors`) Selection** | The best value for `n_neighbors` is identified using the grid search results. | The best `n_neighbors` is not selected or incorrect. |\n", + "| **Model Evaluation on Test Data** | The model is evaluated on the test data using accuracy. | The model evaluation is missing or uses the wrong metric. |\n" + ] + }, + { + "cell_type": "markdown", + "id": "0b4390cc", + "metadata": {}, + "source": [ + "## Submission Information\n", + "\n", + "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. 
Following these guidelines is crucial for your submissions to be evaluated correctly.\n", + "\n", + "### Note:\n", + "\n", + "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked in your pull request by tagging their GitHub username. Separate submissions are required.\n", + "\n", + "### Submission Parameters:\n", + "* Submission Due Date: `11:59 PM - 01/12/2025`\n", + "* The branch name for your repo should be: `assignment-1`\n", + "* What to submit for this assignment:\n", + " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n", + "* What the pull request link should look like for this assignment: `https://github.com//LCR/pull/`\n", + " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n", + "\n", + "Checklist:\n", + "- [ ] Created a branch with the correct naming convention.\n", + "- [ ] Ensured that the repository is public.\n", + "- [ ] Reviewed the PR description guidelines and adhered to them.\n", + "- [ ] Verified that the link is accessible in a private browser window.\n", + "\n", + "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`.
Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/02_activities/assignments/assignment_1_test2_1.ipynb b/02_activities/assignments/assignment_1_test2_1.ipynb new file mode 100644 index 000000000..c1e2b6a7f --- /dev/null +++ b/02_activities/assignments/assignment_1_test2_1.ipynb @@ -0,0 +1,3335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58", + "metadata": {}, + "source": [ + "# Assignment 1" + ] + }, + { + "cell_type": "markdown", + "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32", + "metadata": {}, + "source": [ + "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n", + "\n", + "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the help `help()` function in Python, to get information about the datasets and function in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n", + "\n", + "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. 
We will work with you through the issue." + ] + }, + { + "cell_type": "markdown", + "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629", + "metadata": {}, + "source": [ + "### Classification using KNN\n", + "\n", + "Let's set up our workspace and use the **Wine dataset** from `scikit-learn`. This dataset contains 178 wine samples with 13 chemical features, used to classify wines into different classes based on their origin.\n", + "\n", + "The **response variable** is `class`, which indicates the type of wine. We'll use all of the chemical features to predict this response variable." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4a3485d6-ba58-4660-a983-5680821c5719", + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "# Load the Wine dataset\n", + "wine_data = load_wine()\n", + "\n", + "# Convert to DataFrame\n", + "wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n", + "\n", + "# Bind the 'class' (wine target) to the DataFrame\n", + "wine_df['class'] = wine_data.target\n", + "\n", + "# Display the DataFrame\n", + "wine_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "721b2b17", + "metadata": {}, + "source": [ + "#### **Question 1:** \n", + "#### Data inspection\n", + "\n", + "Before fitting any model, it is essential to understand our data. **Use Python code** to answer the following questions about the **Wine dataset**:\n", + "\n", + "_(i)_ How many observations (rows) does the dataset contain?" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "56916892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count number of rows\n", + "wine_df.shape[0]" + ] + }, + { + "cell_type": "markdown", + "id": "f7573b59", + "metadata": {}, + "source": [ + "_(ii)_ How many variables (columns) does the dataset contain?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df0ef103", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count number of columns\n", + "wine_df.shape[1]" + ] + }, + { + "cell_type": "markdown", + "id": "cb5180c7", + "metadata": {}, + "source": [ + "_(iii)_ What is the 'variable type' of the response variable `class` (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "47989426", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int32')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data type of column class\n", + "wine_df.dtypes['class']" + ] + }, + { + "cell_type": "markdown", + "id": "a25f5e1b", + "metadata": {}, + "source": [ + "\n", + "_(iv)_ How many predictor variables do we have (Hint: all variables other than `class`)?
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd7b0910", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 class 178 non-null int32 \n", + "dtypes: float64(13), int32(1)\n", + "memory usage: 18.9 KB\n" + ] + } + ], + "source": [ + "# Number of predictor variables is 13 (14 columns minus the response variable `class`)\n", + "wine_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "156cc83a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000
mean13.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.8932580.938202
std0.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.9074740.775035
min11.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.0000000.000000
25%12.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.5000000.000000
50%13.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.5000001.000000
75%13.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.0000002.000000
max14.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.0000002.000000
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", + "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", + "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", + "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", + "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", + "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", + "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", + "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 2.295112 2.029270 0.361854 1.590899 \n", + "std 0.625851 0.998859 0.124453 0.572359 \n", + "min 0.980000 0.340000 0.130000 0.410000 \n", + "25% 1.742500 1.205000 0.270000 1.250000 \n", + "50% 2.355000 2.135000 0.340000 1.555000 \n", + "75% 2.800000 2.875000 0.437500 1.950000 \n", + "max 3.880000 5.080000 0.660000 3.580000 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 5.058090 0.957449 2.611685 746.893258 \n", + "std 2.318286 0.228572 0.709990 314.907474 \n", + "min 1.280000 0.480000 1.270000 278.000000 \n", + "25% 3.220000 0.782500 1.937500 500.500000 \n", + "50% 4.690000 0.965000 2.780000 673.500000 \n", + "75% 6.200000 1.120000 3.170000 985.000000 \n", + "max 13.000000 1.710000 4.000000 1680.000000 \n", + "\n", + " class \n", + "count 178.000000 \n", + "mean 0.938202 \n", + "std 0.775035 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 1.000000 \n", + "75% 2.000000 \n", + "max 2.000000 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "d631e8e3", + "metadata": {}, + "source": [ + "You can use `print()` and `describe()` 
to help answer these questions." + ] + }, + { + "cell_type": "markdown", + "id": "fa3832d7", + "metadata": {}, + "source": [ + "#### **Question 2:** \n", + "#### Standardization and data-splitting\n", + "\n", + "Next, we must perform 'pre-processing' or 'data munging', to prepare our data for classification/prediction. For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables. We can achieve this using the scaler method, provided as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cc899b59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], + "source": [ + "# Select predictors (excluding the last column)\n", + "predictors = wine_df.iloc[:, :-1]\n", + "\n", + "# Standardize the predictors\n", + "scaler = StandardScaler()\n", + "predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns)\n", + "\n", + "# Display the head of the
standardized predictors\n", + "print(predictors_standardized.head())" + ] + }, + { + "cell_type": "markdown", + "id": "9981ca48", + "metadata": {}, + "source": [ + "(i) Why is it important to standardize the predictor variables?" + ] + }, + { + "cell_type": "markdown", + "id": "403ef0bb", + "metadata": {}, + "source": [ + "> Standardizing puts all the predictor variables on the same scale, so that no variable dominates because of its large scale and skews the classification result when using machine learning models that rely on distance metrics, such as KNN." + ] + }, + { + "cell_type": "markdown", + "id": "8e2e1bea", + "metadata": {}, + "source": [ + "(ii) Why did we elect not to standardize our response variable `class`?" + ] + }, + { + "cell_type": "markdown", + "id": "fdee5a15", + "metadata": {}, + "source": [ + "> This is the variable we want to determine through the model; it is a class label rather than a predictor, so its scale does not affect the classification result." + ] + }, + { + "cell_type": "markdown", + "id": "8077ec21", + "metadata": {}, + "source": [ + "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?" + ] + }, + { + "cell_type": "markdown", + "id": "f0676c21", + "metadata": {}, + "source": [ + "> Setting a random seed is important because it allows us to control the randomness in our code. Therefore we can reproduce the same result each time we run the code and make comparisons or run tests. The particular seed value is not important: any fixed value works, as long as the same value is used on every run." + ] + }, + { + "cell_type": "markdown", + "id": "36ab9229", + "metadata": {}, + "source": [ + "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables. \n", + "\n", + "Extend the code to create a non-overlapping test set for the predictors and response variables."
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8e4a9dda", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 133 entries, 78 to 66\n", + "Series name: class\n", + "Non-Null Count Dtype\n", + "-------------- -----\n", + "133 non-null int32\n", + "dtypes: int32(1)\n", + "memory usage: 1.6 KB\n" + ] + } + ], + "source": [ + "# set a seed for reproducibility\n", + "np.random.seed(123)\n", + "# split the data into a training and testing set. hint: use train_test_split !\n", + "# Your code here ...\n", + "\n", + "# concatenate the standardized predictors and the response variable into one dataframe \n", + "full_std_data = pd.concat([predictors_standardized,wine_df['class']], axis=1)\n", + "full_std_data\n", + "\n", + "# split the data into training and testing set\n", + "full_std_train, full_std_test = train_test_split(\n", + " full_std_data, train_size=0.75, shuffle= True,\n", + " stratify=full_std_data[\"class\"], \n", + " random_state= 123\n", + ")\n", + "\n", + "## set variables to retrieve the train and test data for the predictor variables and the response variable\n", + "full_std_train\n", + "\n", + "std_train_x = full_std_train.iloc[:,:-1]\n", + "std_train_x\n", + "\n", + "std_test_x = full_std_test.iloc[:,:-1]\n", + "std_test_x\n", + "\n", + "std_train_y = full_std_train ['class']\n", + "std_train_y.info()\n", + "\n", + "std_test_y = full_std_test ['class']\n", + "#std_test_y.info()\n", + "\n", + "# output shows std_train_y has 133 entries and std_test_y has 45 entries, confirming it is a 0.75/0.25 split" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d7c21a90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
01.518613-0.5622500.232053-1.1695931.9139050.8089971.034819-0.6595631.2248840.2517170.3621771.8479201.0130090
10.246290-0.499413-0.827996-2.4908470.0181450.5686480.733629-0.820719-0.544721-0.2933210.4060511.1134490.9652420
20.1968790.0212311.109334-0.2687380.0883580.8089971.215533-0.4984072.1359680.2690200.3183040.7885871.3951480
31.691550-0.3468110.487926-0.8092510.9309182.4914461.466525-0.9818751.0321551.186068-0.4275441.1840712.3345740
40.2957000.2276941.8404030.4519461.2819850.8089970.6633510.2267960.401404-0.3192760.3621770.449601-0.0378740
.............................................
1730.8762752.9745430.3051590.301803-0.332922-0.985614-1.4249001.274310-0.9301791.142811-1.392758-1.231206-0.0219522
1740.4933431.4126090.4148201.0525160.158572-0.793334-1.2843440.549108-0.3169500.969783-1.129518-1.4854450.0098932
1750.3327581.744744-0.3893550.1516611.422412-1.129824-1.3445820.549108-0.4220752.224236-1.612125-1.4854450.2805752
1760.2092320.2276940.0127320.1516611.422412-1.033684-1.3546221.354888-0.2293461.834923-1.568252-1.4006990.2964982
1771.3950861.5831651.3652081.502943-0.262708-0.392751-1.2743051.596623-0.4220751.791666-1.524378-1.428948-0.5951602
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + ".. ... ... ... ... ... \n", + "173 0.876275 2.974543 0.305159 0.301803 -0.332922 \n", + "174 0.493343 1.412609 0.414820 1.052516 0.158572 \n", + "175 0.332758 1.744744 -0.389355 0.151661 1.422412 \n", + "176 0.209232 0.227694 0.012732 0.151661 1.422412 \n", + "177 1.395086 1.583165 1.365208 1.502943 -0.262708 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + ".. ... ... ... ... \n", + "173 -0.985614 -1.424900 1.274310 -0.930179 \n", + "174 -0.793334 -1.284344 0.549108 -0.316950 \n", + "175 -1.129824 -1.344582 0.549108 -0.422075 \n", + "176 -1.033684 -1.354622 1.354888 -0.229346 \n", + "177 -0.392751 -1.274305 1.596623 -0.422075 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline class \n", + "0 0.251717 0.362177 1.847920 1.013009 0 \n", + "1 -0.293321 0.406051 1.113449 0.965242 0 \n", + "2 0.269020 0.318304 0.788587 1.395148 0 \n", + "3 1.186068 -0.427544 1.184071 2.334574 0 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 0 \n", + ".. ... ... ... ... ... 
\n", + "173 1.142811 -1.392758 -1.231206 -0.021952 2 \n", + "174 0.969783 -1.129518 -1.485445 0.009893 2 \n", + "175 2.224236 -1.612125 -1.485445 0.280575 2 \n", + "176 1.834923 -1.568252 -1.400699 0.296498 2 \n", + "177 1.791666 -1.524378 -1.428948 -0.595160 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# display full_std_data\n", + "full_std_data" + ] + }, + { + "cell_type": "markdown", + "id": "4604ee03", + "metadata": {}, + "source": [ + "#### **Question 3:**\n", + "#### Model initialization and cross-validation\n", + "We are finally set to fit the KNN model. \n", + "\n", + "\n", + "Perform a grid search to tune the `n_neighbors` hyperparameter using 10-fold cross-validation. Follow these steps:\n", + "\n", + "1. Initialize the KNN classifier using `KNeighborsClassifier()`.\n", + "2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n", + "3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n", + "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results." + ] + }, + { + "cell_type": "markdown", + "id": "905ed370", + "metadata": {}, + "source": [ + "Question 3 - point 1" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "08818c64", + "metadata": {}, + "outputs": [], + "source": [ + "# initiate KNN\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "42b204f2", + "metadata": {}, + "outputs": [], + "source": [ + "# define x and y for KNN trainig\n", + "X1 = std_train_x\n", + "y1 = std_train_y" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "44a9ab17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fitting KNN\n", + "knn.fit(X1,y1)" + ] + }, + { + "cell_type": "markdown", + "id": "9ffb8bf8", + "metadata": {}, + "source": [ + "Question 3 point 2" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "58c21754", + "metadata": {}, + "outputs": [], + "source": [ + "# define the parameter grid for the grid search: n_neighbors from 1 to 49 in steps of 3\n", + "parameter_grid = {\n", + " \"n_neighbors\": range(1, 50, 3),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "ddf8185b", + "metadata": {}, + "source": [ + "Question 3 point 3 " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b3ad80ad", + "metadata": {}, + "outputs": [], + "source": [ + "# set up the grid search (10-fold cross-validation) to find the best K\n", + "wine_tune_grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "672c7471", + "metadata": {}, + "source": [ + "Question 3 - point 4" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "9fcf66a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n",
+       "             param_grid={'n_neighbors': range(1, 50, 3)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n", + " param_grid={'n_neighbors': range(1, 50, 3)})" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fitting the x and y\n", + "wine_tune_grid.fit(\n", + " X1,\n", + " y1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "e0cca0de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paramsmean_test_score
0{'n_neighbors': 1}0.954396
1{'n_neighbors': 4}0.954945
2{'n_neighbors': 7}0.977473
3{'n_neighbors': 10}0.954396
4{'n_neighbors': 13}0.977473
5{'n_neighbors': 16}0.962637
6{'n_neighbors': 19}0.962637
7{'n_neighbors': 22}0.970330
8{'n_neighbors': 25}0.954945
9{'n_neighbors': 28}0.962637
10{'n_neighbors': 31}0.955495
11{'n_neighbors': 34}0.963187
12{'n_neighbors': 37}0.962637
13{'n_neighbors': 40}0.954945
14{'n_neighbors': 43}0.954945
15{'n_neighbors': 46}0.947253
16{'n_neighbors': 49}0.947253
\n", + "
" + ], + "text/plain": [ + " params mean_test_score\n", + "0 {'n_neighbors': 1} 0.954396\n", + "1 {'n_neighbors': 4} 0.954945\n", + "2 {'n_neighbors': 7} 0.977473\n", + "3 {'n_neighbors': 10} 0.954396\n", + "4 {'n_neighbors': 13} 0.977473\n", + "5 {'n_neighbors': 16} 0.962637\n", + "6 {'n_neighbors': 19} 0.962637\n", + "7 {'n_neighbors': 22} 0.970330\n", + "8 {'n_neighbors': 25} 0.954945\n", + "9 {'n_neighbors': 28} 0.962637\n", + "10 {'n_neighbors': 31} 0.955495\n", + "11 {'n_neighbors': 34} 0.963187\n", + "12 {'n_neighbors': 37} 0.962637\n", + "13 {'n_neighbors': 40} 0.954945\n", + "14 {'n_neighbors': 43} 0.954945\n", + "15 {'n_neighbors': 46} 0.947253\n", + "16 {'n_neighbors': 49} 0.947253" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check out the accuracy\n", + "accuracies_grid = pd.DataFrame(wine_tune_grid.cv_results_)\n", + "#accuracies_grid\n", + "accuracies_grid [[\"params\",\"mean_test_score\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "369cdf3b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 7}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# optimal number of neighbours\n", + "wine_tune_grid.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "aa45b949", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create the plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "# Plot mean test scores with error bars\n", + "plt.plot(accuracies_grid['param_n_neighbors'], accuracies_grid['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3f76bf62", + "metadata": {}, + "source": [ + "#### **Question 4:**\n", + "#### Model evaluation\n", + "\n", + "Using the best value for `n_neighbors`, fit a KNN model on the training data and evaluate its performance on the test set using `accuracy_score`." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "ffefa9f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=7)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Initialize the KNN with the best 'n_neighbors' found\n", + "knn = KNeighborsClassifier(n_neighbors=wine_tune_grid.best_params_['n_neighbors'])\n", + "\n", + "# define X and y from the test data\n", + "X2 = std_test_x\n", + "y2 = std_test_y\n", + "\n", + "# NOTE(review): this fits KNN on the test set itself, so the accuracy computed below is not a held-out estimate; the model should be fit on the training data (X1, y1) and only scored on (X2, y2)\n", + "knn.fit(X2,y2)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "4881b3d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classtest_prediction
10211
8411
9611
6511
7911
1700
10911
11311
2800
15922
3800
3400
12511
11511
7111
7611
13122
3300
6011
1900
11411
4700
4800
15822
13322
13722
15422
13622
200
16822
11711
3200
2200
10811
7310
7711
14222
900
8511
5800
4500
17522
4200
14322
17722
\n", + "
" + ], + "text/plain": [ + " class test_prediction\n", + "102 1 1\n", + "84 1 1\n", + "96 1 1\n", + "65 1 1\n", + "79 1 1\n", + "17 0 0\n", + "109 1 1\n", + "113 1 1\n", + "28 0 0\n", + "159 2 2\n", + "38 0 0\n", + "34 0 0\n", + "125 1 1\n", + "115 1 1\n", + "71 1 1\n", + "76 1 1\n", + "131 2 2\n", + "33 0 0\n", + "60 1 1\n", + "19 0 0\n", + "114 1 1\n", + "47 0 0\n", + "48 0 0\n", + "158 2 2\n", + "133 2 2\n", + "137 2 2\n", + "154 2 2\n", + "136 2 2\n", + "2 0 0\n", + "168 2 2\n", + "117 1 1\n", + "32 0 0\n", + "22 0 0\n", + "108 1 1\n", + "73 1 0\n", + "77 1 1\n", + "142 2 2\n", + "9 0 0\n", + "85 1 1\n", + "58 0 0\n", + "45 0 0\n", + "175 2 2\n", + "42 0 0\n", + "143 2 2\n", + "177 2 2" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# incorporate the test prediction into the test data set and compare\n", + "\n", + "\n", + "full_std_test[\"test_prediction\"] = knn.predict(X2)\n", + "full_std_test[[\"class\",\"test_prediction\"]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "deae09fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "102 1\n", + "84 1\n", + "96 1\n", + "65 1\n", + "79 1\n", + "17 0\n", + "109 1\n", + "113 1\n", + "28 0\n", + "159 2\n", + "38 0\n", + "34 0\n", + "125 1\n", + "115 1\n", + "71 1\n", + "76 1\n", + "131 2\n", + "33 0\n", + "60 1\n", + "19 0\n", + "114 1\n", + "47 0\n", + "48 0\n", + "158 2\n", + "133 2\n", + "137 2\n", + "154 2\n", + "136 2\n", + "2 0\n", + "168 2\n", + "117 1\n", + "32 0\n", + "22 0\n", + "108 1\n", + "73 1\n", + "77 1\n", + "142 2\n", + "9 0\n", + "85 1\n", + "58 0\n", + "45 0\n", + "175 2\n", + "42 0\n", + "143 2\n", + "177 2\n", + "Name: class, dtype: int32" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#other method\n", + "\n", + "test_prediction = knn.predict(std_test_x)\n", + "test_prediction\n", + "std_test_y" + ] + }, + { + 
"cell_type": "code", + "execution_count": 78, + "id": "ed8d4939", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9777777777777777" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking the prediction accuracy using Knn score method or accuracy_score \n", + "knn.score(X2,y2)\n", + "\n", + "#accuracy_score(X2,y2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "1a401690", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9777777777777777" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#other method\n", + "accuracy_score(test_prediction,std_test_y)\n" + ] + }, + { + "cell_type": "markdown", + "id": "6f8a69db", + "metadata": {}, + "source": [ + "# Criteria\n", + "\n", + "\n", + "| **Criteria** | **Complete** | **Incomplete** |\n", + "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n", + "| **Data Inspection** | Data is inspected for number of variables, observations and data types. | Data inspection is missing or incomplete. |\n", + "| **Data Scaling** | Data scaling or normalization is applied where necessary (e.g., using `StandardScaler`). | Data scaling or normalization is missing or incorrectly applied. |\n", + "| **Model Initialization** | The KNN model is correctly initialized and a random seed is set for reproducibility. | The KNN model is not initialized, is incorrect, or lacks a random seed for reproducibility. |\n", + "| **Parameter Grid for `n_neighbors`** | The parameter grid for `n_neighbors` is correctly defined. | The parameter grid is missing or incorrectly defined. |\n", + "| **Cross-Validation Setup** | Cross-validation is set up correctly with 10 folds. | Cross-validation is missing or incorrectly set up. 
|\n", + "| **Best Hyperparameter (`n_neighbors`) Selection** | The best value for `n_neighbors` is identified using the grid search results. | The best `n_neighbors` is not selected or incorrect. |\n", + "| **Model Evaluation on Test Data** | The model is evaluated on the test data using accuracy. | The model evaluation is missing or uses the wrong metric. |\n" + ] + }, + { + "cell_type": "markdown", + "id": "0b4390cc", + "metadata": {}, + "source": [ + "## Submission Information\n", + "\n", + "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. Following these guidelines is crucial for your submissions to be evaluated correctly.\n", + "\n", + "### Note:\n", + "\n", + "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked with in your pull request by tagging their GitHub username. Separate submissions are required.\n", + "\n", + "### Submission Parameters:\n", + "* Submission Due Date: `11:59 PM - 01/12/2025`\n", + "* The branch name for your repo should be: `assignment-1`\n", + "* What to submit for this assignment:\n", + " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n", + "* What the pull request link should look like for this assignment: `https://github.com//LCR/pull/`\n", + " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. 
This helps the technical facilitator and learning support staff review your submission easily.\n", + "\n", + "Checklist:\n", + "- [ ] Created a branch with the correct naming convention.\n", + "- [ ] Ensured that the repository is public.\n", + "- [ ] Reviewed the PR description guidelines and adhered to them.\n", + "- [ ] Verify that the link is accessible in a private browser window.\n", + "\n", + "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/02_activities/assignments/assignment_1_using lesson example.ipynb b/02_activities/assignments/assignment_1_using lesson example.ipynb new file mode 100644 index 000000000..ed1fc91e9 --- /dev/null +++ b/02_activities/assignments/assignment_1_using lesson example.ipynb @@ -0,0 +1,2661 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58", + "metadata": {}, + "source": [ + "# Assignment 1" + ] + }, + { + "cell_type": "markdown", + "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32", + "metadata": {}, + "source": [ + "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. 
For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n", + "\n", + "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the `help()` function in Python to get information about the datasets and functions in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n", + "\n", + "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "4a3485d6-ba58-4660-a983-5680821c5719", + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "# Load the Wine dataset\n", + "wine_data = load_wine()\n", + "\n", + "# Convert to DataFrame\n", + "wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n", + "\n", + "# Bind the 'class' (wine target) to the DataFrame\n", + "wine_df['class'] = wine_data.target\n", + "\n", + "# Display the DataFrame\n", + "wine_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "721b2b17", + "metadata": {}, + "source": [ + "#### **Question 1:** \n", + "#### Data inspection\n", + "\n", + "Before fitting any model, it is essential to understand our data. **Use Python code** to answer the following questions about the **Wine dataset**:\n", + "\n", + "_(i)_ How many observations (rows) does the dataset contain?" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "56916892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count number of rows\n", + "wine_df.shape[0]" + ] + }, + { + "cell_type": "markdown", + "id": "f7573b59", + "metadata": {}, + "source": [ + "_(ii)_ How many variables (columns) does the dataset contain?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "56916892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count number of rows\n", + "wine_df.shape[0]" + ] + }, + { + "cell_type": "markdown", + "id": "f7573b59", + "metadata": {}, + "source": [ + "_(ii)_ How many variables (columns) does the dataset contain?" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "df0ef103", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count number of columns\n", + "wine_df.shape[1]" + ] + }, + { + "cell_type": "markdown", + "id": "cb5180c7", + "metadata": {}, + "source": [ + "_(iii)_ What is the 'variable type' of the response variable `class` (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "47989426", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int32')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data type of column class\n", + "wine_df.dtypes['class']" + ] + }, + { + "cell_type": "markdown", + "id": "a25f5e1b", + "metadata": {}, + "source": [ + "\n", + "_(iv)_ How many predictor variables do we have (Hint: all variables other than `class`)? 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bd7b0910", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 class 178 non-null int32 \n", + "dtypes: float64(13), int32(1)\n", + "memory usage: 18.9 KB\n" + ] + } + ], + "source": [ + "# Number of predictor variables is 13 (14 columns minus the response variable `class`)\n", + "wine_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "156cc83a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... \n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df" + ] + }, + { + "cell_type": "markdown", + "id": "d631e8e3", + "metadata": {}, + "source": [ + "You can use `print()` and `describe()` to help answer these questions." + ] + }, + { + "cell_type": "markdown", + "id": "fa3832d7", + "metadata": {}, + "source": [ + "#### **Question 2:** \n", + "#### Standardization and data-splitting\n", + "\n", + "Next, we must preform 'pre-processing' or 'data munging', to prepare our data for classification/prediction. 
For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables. We can achieve this using the scaler method, provided as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cc899b59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], + "source": [ + "# Select predictors (excluding the last column)\n", + "predictors = wine_df.iloc[:, :-1]\n", + "\n", + "# Standardize the predictors\n", + "scaler = StandardScaler()\n", + "predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns)\n", + "\n", + "# Display the head of the standardized predictors\n", + "print(predictors_standardized.head())" + ] + }, + { + "cell_type": "markdown", + "id": "9981ca48", + "metadata": {}, + "source": [ + "(i) Why is it important to standardize the predictor variables?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "403ef0bb", + "metadata": {}, + "source": [ + "> Standardizing puts all the predictor variables on the same scale, so that no variable dominates the others simply because of its large numeric range and skews the classification result when using machine learning models that rely on distance metrics, such as KNN." + ] + }, + { + "cell_type": "markdown", + "id": "8e2e1bea", + "metadata": {}, + "source": [ + "(ii) Why did we elect not to standardize our response variable `Class`?" + ] + }, + { + "cell_type": "markdown", + "id": "fdee5a15", + "metadata": {}, + "source": [ + "> This is the variable we want to predict with the model. It is a categorical label rather than a measurement, so it is not used in the distance calculation and its scale does not affect the classification result." + ] + }, + { + "cell_type": "markdown", + "id": "8077ec21", + "metadata": {}, + "source": [ + "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?" + ] + }, + { + "cell_type": "markdown", + "id": "f0676c21", + "metadata": {}, + "source": [ + "> Setting a random seed is important because it allows us to control the randomness in our code. We can therefore reproduce the same results every time the code is run, which makes comparison and testing possible. The particular seed value is not important; what matters is that the same value is used consistently." + ] + }, + { + "cell_type": "markdown", + "id": "36ab9229", + "metadata": {}, + "source": [ + "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables. \n", + "\n", + "Extend the code to create a non-overlapping test set for the predictors and response variables."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a9dda", + "metadata": {}, + "outputs": [], + "source": [ + "# set a seed for reproducibility\n", + "np.random.seed(123)\n", + "# split the data into a training and testing set. hint: use train_test_split !\n", + "# Your code here ..." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "72c101f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Do not touch - PLEASE IGNORE THIS CELL\n", + "#np.random.seed(123)\n", + "# Create a random vector of True and False values to split the data\n", + "#split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "af2f9ce3", + "metadata": {}, + "outputs": [], + "source": [ + "# split the data into training and testing set\n", + "#\n", + "predictor_S_train, predictor_S_test, label_c_train, label_c_test= train_test_split(\n", + " predictors_standardized, wine_df['class'], train_size=0.75, shuffle= True,\n", + " stratify=wine_df[\"class\"], \n", + " random_state= 123\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ea62ffc2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 133 entries, 28 to 109\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 133 non-null float64\n", + " 1 malic_acid 133 non-null float64\n", + " 2 ash 133 non-null float64\n", + " 3 alcalinity_of_ash 133 non-null float64\n", + " 4 magnesium 133 non-null float64\n", + " 5 total_phenols 133 non-null float64\n", + " 6 flavanoids 133 non-null float64\n", + " 7 nonflavanoid_phenols 133 non-null float64\n", + " 8 proanthocyanins 133 non-null float64\n", + " 9 color_intensity 133 non-null float64\n", + " 10 hue 133 non-null float64\n", + " 11 od280/od315_of_diluted_wines 133 
non-null float64\n", + " 12 proline 133 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 14.5 KB\n" + ] + } + ], + "source": [ + "predictor_S_train.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "0159273e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 45 entries, 102 to 177\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 45 non-null float64\n", + " 1 malic_acid 45 non-null float64\n", + " 2 ash 45 non-null float64\n", + " 3 alcalinity_of_ash 45 non-null float64\n", + " 4 magnesium 45 non-null float64\n", + " 5 total_phenols 45 non-null float64\n", + " 6 flavanoids 45 non-null float64\n", + " 7 nonflavanoid_phenols 45 non-null float64\n", + " 8 proanthocyanins 45 non-null float64\n", + " 9 color_intensity 45 non-null float64\n", + " 10 hue 45 non-null float64\n", + " 11 od280/od315_of_diluted_wines 45 non-null float64\n", + " 12 proline 45 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 4.9 KB\n" + ] + } + ], + "source": [ + "predictor_S_test.info()" + ] + }, + { + "cell_type": "markdown", + "id": "4604ee03", + "metadata": {}, + "source": [ + "#### **Question 3:**\n", + "#### Model initialization and cross-validation\n", + "We are finally set to fit the KNN model. \n", + "\n", + "\n", + "Perform a grid search to tune the `n_neighbors` hyperparameter using 10-fold cross-validation. Follow these steps:\n", + "\n", + "1. Initialize the KNN classifier using `KNeighborsClassifier()`.\n", + "2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n", + "3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n", + "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "334fc8d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
01.518613-0.5622500.232053-1.1695931.9139050.8089971.034819-0.6595631.2248840.2517170.3621771.8479201.0130090
10.246290-0.499413-0.827996-2.4908470.0181450.5686480.733629-0.820719-0.544721-0.2933210.4060511.1134490.9652420
20.1968790.0212311.109334-0.2687380.0883580.8089971.215533-0.4984072.1359680.2690200.3183040.7885871.3951480
31.691550-0.3468110.487926-0.8092510.9309182.4914461.466525-0.9818751.0321551.186068-0.4275441.1840712.3345740
40.2957000.2276941.8404030.4519461.2819850.8089970.6633510.2267960.401404-0.3192760.3621770.449601-0.0378740
.............................................
1730.8762752.9745430.3051590.301803-0.332922-0.985614-1.4249001.274310-0.9301791.142811-1.392758-1.231206-0.0219522
1740.4933431.4126090.4148201.0525160.158572-0.793334-1.2843440.549108-0.3169500.969783-1.129518-1.4854450.0098932
1750.3327581.744744-0.3893550.1516611.422412-1.129824-1.3445820.549108-0.4220752.224236-1.612125-1.4854450.2805752
1760.2092320.2276940.0127320.1516611.422412-1.033684-1.3546221.354888-0.2293461.834923-1.568252-1.4006990.2964982
1771.3950861.5831651.3652081.502943-0.262708-0.392751-1.2743051.596623-0.4220751.791666-1.524378-1.428948-0.5951602
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + ".. ... ... ... ... ... \n", + "173 0.876275 2.974543 0.305159 0.301803 -0.332922 \n", + "174 0.493343 1.412609 0.414820 1.052516 0.158572 \n", + "175 0.332758 1.744744 -0.389355 0.151661 1.422412 \n", + "176 0.209232 0.227694 0.012732 0.151661 1.422412 \n", + "177 1.395086 1.583165 1.365208 1.502943 -0.262708 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + ".. ... ... ... ... \n", + "173 -0.985614 -1.424900 1.274310 -0.930179 \n", + "174 -0.793334 -1.284344 0.549108 -0.316950 \n", + "175 -1.129824 -1.344582 0.549108 -0.422075 \n", + "176 -1.033684 -1.354622 1.354888 -0.229346 \n", + "177 -0.392751 -1.274305 1.596623 -0.422075 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline class \n", + "0 0.251717 0.362177 1.847920 1.013009 0 \n", + "1 -0.293321 0.406051 1.113449 0.965242 0 \n", + "2 0.269020 0.318304 0.788587 1.395148 0 \n", + "3 1.186068 -0.427544 1.184071 2.334574 0 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 0 \n", + ".. ... ... ... ... ... 
\n", + "173 1.142811 -1.392758 -1.231206 -0.021952 2 \n", + "174 0.969783 -1.129518 -1.485445 0.009893 2 \n", + "175 2.224236 -1.612125 -1.485445 0.280575 2 \n", + "176 1.834923 -1.568252 -1.400699 0.296498 2 \n", + "177 1.791666 -1.524378 -1.428948 -0.595160 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prepare and scale the df to initialize the knn model\n", + "# create a df that only class is not scaled but the other columns are standardized.\n", + "\n", + "standardized_wine = wine_df.copy()\n", + "\n", + "columns_to_exclude = ['class']\n", + "\n", + "columns_to_scale = standardized_wine.columns.difference(columns_to_exclude)\n", + "\n", + "scaler = StandardScaler()\n", + "standardized_wine[columns_to_scale] = scaler.fit_transform(standardized_wine[columns_to_scale])\n", + "standardized_wine" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2f36e25c", + "metadata": {}, + "outputs": [], + "source": [ + "# split the data into training and testing set\n", + "wine_train, wine_test = train_test_split(\n", + " standardized_wine, train_size=0.75, shuffle= True,\n", + " stratify=standardized_wine[\"class\"], random_state= 123\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "73531bc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 133 entries, 78 to 66\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 133 non-null float64\n", + " 1 malic_acid 133 non-null float64\n", + " 2 ash 133 non-null float64\n", + " 3 alcalinity_of_ash 133 non-null float64\n", + " 4 magnesium 133 non-null float64\n", + " 5 total_phenols 133 non-null float64\n", + " 6 flavanoids 133 non-null float64\n", + " 7 nonflavanoid_phenols 133 non-null float64\n", + " 8 proanthocyanins 133 non-null 
float64\n", + " 9 color_intensity 133 non-null float64\n", + " 10 hue 133 non-null float64\n", + " 11 od280/od315_of_diluted_wines 133 non-null float64\n", + " 12 proline 133 non-null float64\n", + " 13 class 133 non-null int32 \n", + "dtypes: float64(13), int32(1)\n", + "memory usage: 15.1 KB\n" + ] + } + ], + "source": [ + "wine_train.info() # to check if the split is right" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c4d9c807", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 45 entries, 102 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 45 non-null float64\n", + " 1 malic_acid 45 non-null float64\n", + " 2 ash 45 non-null float64\n", + " 3 alcalinity_of_ash 45 non-null float64\n", + " 4 magnesium 45 non-null float64\n", + " 5 total_phenols 45 non-null float64\n", + " 6 flavanoids 45 non-null float64\n", + " 7 nonflavanoid_phenols 45 non-null float64\n", + " 8 proanthocyanins 45 non-null float64\n", + " 9 color_intensity 45 non-null float64\n", + " 10 hue 45 non-null float64\n", + " 11 od280/od315_of_diluted_wines 45 non-null float64\n", + " 12 proline 45 non-null float64\n", + " 13 class 45 non-null int32 \n", + "dtypes: float64(13), int32(1)\n", + "memory usage: 5.1 KB\n" + ] + } + ], + "source": [ + "wine_test.info()" + ] + }, + { + "cell_type": "markdown", + "id": "905ed370", + "metadata": {}, + "source": [ + "Question 3 - point 1" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "08818c64", + "metadata": {}, + "outputs": [], + "source": [ + "# initiate KNN\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "42b204f2", + "metadata": {}, + "outputs": [], + "source": [ + "# define x and y for KNN\n", + "X1 = predictor_S_train\n", + "y1 = label_c_train" + ] + }, + { + "cell_type": 
"code", + "execution_count": 11, + "id": "5f52c83b", + "metadata": {}, + "outputs": [], + "source": [ + "# define x and y for KNN\n", + "X = standardized_wine[columns_to_scale]\n", + "y = standardized_wine['class']" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "44a9ab17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fitting KNN\n", + "knn.fit(X1,y1)" + ] + }, + { + "cell_type": "markdown", + "id": "9ffb8bf8", + "metadata": {}, + "source": [ + "Question 3 point 2" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "58c21754", + "metadata": {}, + "outputs": [], + "source": [ + "# implementing a gridSearch , define pararmeter grid, riging from 1 to 50\n", + "parameter_grid = {\n", + " \"n_neighbors\": range(1, 50, 3),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "ddf8185b", + "metadata": {}, + "source": [ + "Question 3 point 3 " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b3ad80ad", + "metadata": {}, + "outputs": [], + "source": [ + "# use function to search best K -- implementing a gridSearch \n", + "wine_tune_grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "672c7471", + "metadata": {}, + "source": [ + "Question 3 - point 4" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9fcf66a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n",
+       "             param_grid={'n_neighbors': range(1, 50, 3)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n", + " param_grid={'n_neighbors': range(1, 50, 3)})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fitting the x and y\n", + "wine_tune_grid.fit(\n", + " wine_train[columns_to_scale],\n", + " wine_train[\"class\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e0cca0de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paramsmean_test_score
0{'n_neighbors': 1}0.954396
1{'n_neighbors': 4}0.954945
2{'n_neighbors': 7}0.977473
3{'n_neighbors': 10}0.954396
4{'n_neighbors': 13}0.977473
5{'n_neighbors': 16}0.962637
6{'n_neighbors': 19}0.962637
7{'n_neighbors': 22}0.970330
8{'n_neighbors': 25}0.954945
9{'n_neighbors': 28}0.962637
10{'n_neighbors': 31}0.955495
11{'n_neighbors': 34}0.963187
12{'n_neighbors': 37}0.962637
13{'n_neighbors': 40}0.954945
14{'n_neighbors': 43}0.954945
15{'n_neighbors': 46}0.947253
16{'n_neighbors': 49}0.947253
\n", + "
" + ], + "text/plain": [ + " params mean_test_score\n", + "0 {'n_neighbors': 1} 0.954396\n", + "1 {'n_neighbors': 4} 0.954945\n", + "2 {'n_neighbors': 7} 0.977473\n", + "3 {'n_neighbors': 10} 0.954396\n", + "4 {'n_neighbors': 13} 0.977473\n", + "5 {'n_neighbors': 16} 0.962637\n", + "6 {'n_neighbors': 19} 0.962637\n", + "7 {'n_neighbors': 22} 0.970330\n", + "8 {'n_neighbors': 25} 0.954945\n", + "9 {'n_neighbors': 28} 0.962637\n", + "10 {'n_neighbors': 31} 0.955495\n", + "11 {'n_neighbors': 34} 0.963187\n", + "12 {'n_neighbors': 37} 0.962637\n", + "13 {'n_neighbors': 40} 0.954945\n", + "14 {'n_neighbors': 43} 0.954945\n", + "15 {'n_neighbors': 46} 0.947253\n", + "16 {'n_neighbors': 49} 0.947253" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check out the accuracy\n", + "accuracies_grid = pd.DataFrame(wine_tune_grid.cv_results_)\n", + "#accuracies_grid\n", + "accuracies_grid [[\"params\",\"mean_test_score\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "369cdf3b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 7}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# optimal number of neighbours\n", + "wine_tune_grid.best_params_" + ] + }, + { + "cell_type": "markdown", + "id": "3f76bf62", + "metadata": {}, + "source": [ + "#### **Question 4:**\n", + "#### Model evaluation\n", + "\n", + "Using the best value for `n_neighbors`, fit a KNN model on the training data and evaluate its performance on the test set using `accuracy_score`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffefa9f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here..." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6f8a69db", + "metadata": {}, + "source": [ + "# Criteria\n", + "\n", + "\n", + "| **Criteria** | **Complete** | **Incomplete** |\n", + "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n", + "| **Data Inspection** | Data is inspected for number of variables, observations and data types. | Data inspection is missing or incomplete. |\n", + "| **Data Scaling** | Data scaling or normalization is applied where necessary (e.g., using `StandardScaler`). | Data scaling or normalization is missing or incorrectly applied. |\n", + "| **Model Initialization** | The KNN model is correctly initialized and a random seed is set for reproducibility. | The KNN model is not initialized, is incorrect, or lacks a random seed for reproducibility. |\n", + "| **Parameter Grid for `n_neighbors`** | The parameter grid for `n_neighbors` is correctly defined. | The parameter grid is missing or incorrectly defined. |\n", + "| **Cross-Validation Setup** | Cross-validation is set up correctly with 10 folds. | Cross-validation is missing or incorrectly set up. |\n", + "| **Best Hyperparameter (`n_neighbors`) Selection** | The best value for `n_neighbors` is identified using the grid search results. | The best `n_neighbors` is not selected or incorrect. |\n", + "| **Model Evaluation on Test Data** | The model is evaluated on the test data using accuracy. | The model evaluation is missing or uses the wrong metric. |\n" + ] + }, + { + "cell_type": "markdown", + "id": "0b4390cc", + "metadata": {}, + "source": [ + "## Submission Information\n", + "\n", + "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. 
Following these guidelines is crucial for your submissions to be evaluated correctly.\n", + "\n", + "### Note:\n", + "\n", + "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked with in your pull request by tagging their GitHub username. Separate submissions are required.\n", + "\n", + "### Submission Parameters:\n", + "* Submission Due Date: `11:59 PM - 01/12/2025`\n", + "* The branch name for your repo should be: `assignment-1`\n", + "* What to submit for this assignment:\n", + " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n", + "* What the pull request link should look like for this assignment: `https://github.com//LCR/pull/`\n", + " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n", + "\n", + "Checklist:\n", + "- [ ] Created a branch with the correct naming convention.\n", + "- [ ] Ensured that the repository is public.\n", + "- [ ] Reviewed the PR description guidelines and adhered to them.\n", + "- [ ] Verify that the link is accessible in a private browser window.\n", + "\n", + "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. 
Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/02_activities/simple sample for train test split.ipynb b/02_activities/simple sample for train test split.ipynb new file mode 100644 index 000000000..3f32ef7e9 --- /dev/null +++ b/02_activities/simple sample for train test split.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 1],\n", + " [2, 3],\n", + " [4, 5],\n", + " [6, 7],\n", + " [8, 9]])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X, y = np.arange(10).reshape((5, 2)), range(5)\n", + "\n", + "\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 1, 2, 3, 4]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list (y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + "\n", + " X, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[8, 9],\n", + " [4, 5],\n", + " [0, 1],\n", + " [6, 7]])" + ] + }, 
+ "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2, 3]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[4, 2, 0, 3]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/load_file_test.ipynb b/load_file_test.ipynb new file mode 100644 index 000000000..9c355e1f9 --- /dev/null +++ b/load_file_test.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# load in libraries\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from mpl_toolkits import mplot3d" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302M17.9910.38122.801001.00.118400.277600.300100.14710...25.38017.33184.602019.00.162200.665600.71190.26540.46010.11890
1842517M20.5717.77132.901326.00.084740.078640.086900.07017...24.99023.41158.801956.00.123800.186600.24160.18600.27500.08902
284300903M19.6921.25130.001203.00.109600.159900.197400.12790...23.57025.53152.501709.00.144400.424500.45040.24300.36130.08758
384348301M11.4220.3877.58386.10.142500.283900.241400.10520...14.91026.5098.87567.70.209800.866300.68690.25750.66380.17300
484358402M20.2914.34135.101297.00.100300.132800.198000.10430...22.54016.67152.201575.00.137400.205000.40000.16250.23640.07678
..................................................................
564926424M21.5622.39142.001479.00.111000.115900.243900.13890...25.45026.40166.102027.00.141000.211300.41070.22160.20600.07115
565926682M20.1328.25131.201261.00.097800.103400.144000.09791...23.69038.25155.001731.00.116600.192200.32150.16280.25720.06637
566926954M16.6028.08108.30858.10.084550.102300.092510.05302...18.98034.12126.701124.00.113900.309400.34030.14180.22180.07820
567927241M20.6029.33140.101265.00.117800.277000.351400.15200...25.74039.42184.601821.00.165000.868100.93870.26500.40870.12400
56892751B7.7624.5447.92181.00.052630.043620.000000.00000...9.45630.3759.16268.60.089960.064440.00000.00000.28710.07039
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 \n", + ".. ... ... ... ... ... ... \n", + "564 926424 M 21.56 22.39 142.00 1479.0 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 \n", + "566 926954 M 16.60 28.08 108.30 858.1 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 \n", + "568 92751 B 7.76 24.54 47.92 181.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.30010 0.14710 \n", + "1 0.08474 0.07864 0.08690 0.07017 \n", + "2 0.10960 0.15990 0.19740 0.12790 \n", + "3 0.14250 0.28390 0.24140 0.10520 \n", + "4 0.10030 0.13280 0.19800 0.10430 \n", + ".. ... ... ... ... \n", + "564 0.11100 0.11590 0.24390 0.13890 \n", + "565 0.09780 0.10340 0.14400 0.09791 \n", + "566 0.08455 0.10230 0.09251 0.05302 \n", + "567 0.11780 0.27700 0.35140 0.15200 \n", + "568 0.05263 0.04362 0.00000 0.00000 \n", + "\n", + " ... radius_worst texture_worst perimeter_worst area_worst \\\n", + "0 ... 25.380 17.33 184.60 2019.0 \n", + "1 ... 24.990 23.41 158.80 1956.0 \n", + "2 ... 23.570 25.53 152.50 1709.0 \n", + "3 ... 14.910 26.50 98.87 567.7 \n", + "4 ... 22.540 16.67 152.20 1575.0 \n", + ".. ... ... ... ... ... \n", + "564 ... 25.450 26.40 166.10 2027.0 \n", + "565 ... 23.690 38.25 155.00 1731.0 \n", + "566 ... 18.980 34.12 126.70 1124.0 \n", + "567 ... 25.740 39.42 184.60 1821.0 \n", + "568 ... 9.456 30.37 59.16 268.6 \n", + "\n", + " smoothness_worst compactness_worst concavity_worst \\\n", + "0 0.16220 0.66560 0.7119 \n", + "1 0.12380 0.18660 0.2416 \n", + "2 0.14440 0.42450 0.4504 \n", + "3 0.20980 0.86630 0.6869 \n", + "4 0.13740 0.20500 0.4000 \n", + ".. ... ... ... 
\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# loading data\n", + "\n", + "cancer = pd.read_csv('01_materials/notebooks/dataset/wdbc.csv')\n", + "cancer" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}