diff --git a/01_materials/notebooks/Classification-1.ipynb b/01_materials/notebooks/Classification-1.ipynb index 7b6959a7a..5b798239b 100644 --- a/01_materials/notebooks/Classification-1.ipynb +++ b/01_materials/notebooks/Classification-1.ipynb @@ -2326,7 +2326,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -2340,7 +2340,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index e50cc66eb..8a4221ed9 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 91, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 92, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... \n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -76,6 +354,154 @@ "wine_df\n" ] }, + { + "cell_type": "code", + "execution_count": 93, + "id": "bcc00857", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Keys: dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])\n", + "\n", + "Description: .. 
_wine_dataset:\n", + "\n", + "Wine recognition dataset\n", + "------------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + ":Number of Instances: 178\n", + ":Number of Attributes: 13 numeric, predictive attributes and the class\n", + ":Attribute Information:\n", + " - Alcohol\n", + " - Malic acid\n", + " - Ash\n", + " - Alcalinity of ash\n", + " - Magnesium\n", + " - Total phenols\n", + " - Flavanoids\n", + " - Nonflavanoid phenols\n", + " - Proanthocyanins\n", + " - Color intensity\n", + " - Hue\n", + " - OD280/OD315 of diluted wines\n", + " - Proline\n", + " - class:\n", + " - class_0\n", + " - class_1\n", + " - class_2\n", + "\n", + ":Summary Statistics:\n", + "\n", + "============================= ==== ===== ======= =====\n", + " Min Max Mean SD\n", + "============================= ==== ===== ======= =====\n", + "Alcohol: 11.0 14.8 13.0 0.8\n", + "Malic Acid: 0.74 5.80 2.34 1.12\n", + "Ash: 1.36 3.23 2.36 0.27\n", + "Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", + "Magnesium: 70.0 162.0 99.7 14.3\n", + "Total Phenols: 0.98 3.88 2.29 0.63\n", + "Flavanoids: 0.34 5.08 2.03 1.00\n", + "Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", + "Proanthocyanins: 0.41 3.58 1.59 0.57\n", + "Colour Intensity: 1.3 13.0 5.1 2.3\n", + "Hue: 0.48 1.71 0.96 0.23\n", + "OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", + "Proline: 278 1680 746 315\n", + "============================= ==== ===== ======= =====\n", + "\n", + ":Missing Attribute Values: None\n", + ":Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", + ":Creator: R.A. Fisher\n", + ":Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", + ":Date: July, 1988\n", + "\n", + "This is a copy of UCI ML Wine recognition datasets.\n", + "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", + "\n", + "The data is the results of a chemical analysis of wines grown in the same\n", + "region in Italy by three different cultivators. 
There are thirteen different\n", + "measurements taken for different constituents found in the three types of\n", + "wine.\n", + "\n", + "Original Owners:\n", + "\n", + "Forina, M. et al, PARVUS -\n", + "An Extendible Package for Data Exploration, Classification and Correlation.\n", + "Institute of Pharmaceutical and Food Analysis and Technologies,\n", + "Via Brigata Salerno, 16147 Genoa, Italy.\n", + "\n", + "Citation:\n", + "\n", + "Lichman, M. (2013). UCI Machine Learning Repository\n", + "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", + "School of Information and Computer Science.\n", + "\n", + ".. dropdown:: References\n", + "\n", + " (1) S. Aeberhard, D. Coomans and O. de Vel,\n", + " Comparison of Classifiers in High Dimensional Settings,\n", + " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of\n", + " Mathematics and Statistics, James Cook University of North Queensland.\n", + " (Also submitted to Technometrics).\n", + "\n", + " The data was used with many others for comparing various\n", + " classifiers. The classes are separable, though only RDA\n", + " has achieved 100% correct classification.\n", + " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data))\n", + " (All results using the leave-one-out technique)\n", + "\n", + " (2) S. Aeberhard, D. Coomans and O. de Vel,\n", + " \"THE CLASSIFICATION PERFORMANCE OF RDA\"\n", + " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. 
of\n", + " Mathematics and Statistics, James Cook University of North Queensland.\n", + " (Also submitted to Journal of Chemometrics).\n", + "\n", + "\n", + "Feature Name: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']\n", + "\n", + "Target Name: ['class_0' 'class_1' 'class_2']\n", + "\n", + "Data: [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00\n", + " 2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]\n", + " [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00\n", + " 2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]\n", + " [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00\n", + " 3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]\n", + " [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00\n", + " 2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]\n", + " [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00\n", + " 3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]\n", + "\n", + "Target: [0 0 0 0 0]\n" + ] + } + ], + "source": [ + "#Display the dataset keys\n", + "print(f\"\\nKeys: {wine_data.keys()}\")\n", + "\n", + "#Display the dataset description\n", + "print(f\"\\nDescription: {wine_data.DESCR}\")\n", + "\n", + "#Display the feature names\n", + "print(f\"\\nFeature Name: {wine_data.feature_names}\")\n", + "\n", + "#Display the target names\n", + "print(f\"\\nTarget Name: {wine_data.target_names}\")\n", + "\n", + "#Display the first 5 rows of data\n", + "print(f\"\\nData: {wine_data.data[:5]}\")\n", + "\n", + "#display the first 5 target values\n", + "print(f\"\\nTarget: {wine_data.target[:5]}\")\n" + ] + }, { "cell_type": "markdown", "id": "721b2b17", @@ -91,12 +517,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
94, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 class 178 non-null int64 \n", + "dtypes: float64(13), int64(1)\n", + "memory usage: 19.6 KB\n", + "Info: None\n", + "\n", + "Number of rows: 178\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here: 178\n", + "\n", + "# Inspect the wine_df DataFrame using the .info() method\n", + "print(f\"Info: {wine_df.info()}\") #There should be 178 entries and 14 columns\n", + "\n", + "#Using the .shape attribute to find the number of rows\n", + "print(f\"\\nNumber of rows: {wine_df.shape[0]}\") #There should be 178 entries\n" ] }, { @@ -109,12 +572,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 95, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Number of columns: 14\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here: 14\n", + "\n", + "#Using the .shape attribute to find the number of columns \n", + "print(f\"\\nNumber of columns: {wine_df.shape[1]}\") \n" ] }, { @@ -127,12 +602,28 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 96, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Variable type: int64\n", + "Unique value: 3\n", + "Value(s): [0 1 2]\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here: Integer with 3 unique values/levels (0 1 2)\n", + "\n", + "print(f\"\\nVariable type: {wine_df['class'].dtype}\") #The type should be integer (int64)\n", + "print(f\"Unique value: {wine_df['class'].nunique()}\") #There should be 3 unique values\n", + "print(f\"Value(s): {wine_df['class'].unique()}\") #The values should be 0, 1, and 2\n", + "\n" ] }, { @@ -146,12 +637,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 97, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Number of predictor variables: 13\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here: 13\n", + "\n", + "#Number of predictor variables (features)\n", + "print(f\"\\nNumber of predictor variables: {wine_df.shape[1] - 1}\") #Number of columns - target\n" ] }, { @@ -175,10 +678,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 98, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 
1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +734,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> We standardize so that big numbers do not completely overshadow the small ones. This step ensures all 13 predictor variables (features) have an equal chance to influence classification." ] }, { @@ -220,7 +750,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> We standardize predictor variables only and exclude the response variable 'Class' because it’s just the label/category (Class: 0, 1, or 2) we’re trying to predict." ] }, { @@ -236,7 +766,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> Setting a seed makes random processes repeatable, which is useful for reproducibility, debugging and comparing models since they all train/test on the same data split." 
] }, { @@ -251,17 +781,122 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 99, "id": "72c101f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train shape: (133, 13)\n", + "X_test shape: (45, 13)\n", + "y_train shape: (133,)\n", + "y_test shape: (45,)\n", + "X_train (first 5 rows):\n", + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "78 -0.828391 -1.208567 -1.522511 -1.409821 2.545825 \n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "15 0.777454 -0.472483 1.218995 -0.689137 0.860705 \n", + "13 2.160950 -0.544297 0.085839 -2.430790 -0.613775 \n", + "14 1.703902 -0.418624 0.049285 -2.250619 0.158572 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "78 -0.633101 -0.179981 -0.095517 2.048364 \n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "15 0.889114 0.884224 -0.498407 -0.229346 \n", + "13 1.289697 1.667318 0.549108 2.135968 \n", + "14 1.610163 1.617120 -0.578985 2.398780 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "78 -0.717240 0.449924 -0.426113 0.009893 \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "15 0.969783 1.415139 0.378979 1.793210 \n", + "13 0.147900 1.283518 0.167113 1.283691 \n", + "14 1.056297 1.064151 0.548472 2.547935 \n", + "\n", + "y_train (first 5 rows):\n", + "78 1\n", + "0 0\n", + "15 0\n", + "13 0\n", + "14 0\n", + "Name: class, dtype: int64 \n", + "\n", + "X_test (first 5 rows):\n", + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "102 -0.816038 0.102021 0.341713 0.451946 -0.122282 \n", + "84 -1.433671 -1.298334 0.780354 -0.448909 -0.403135 \n", + "96 -1.470729 -0.194208 1.365208 0.602088 2.405399 \n", + "65 -0.778980 -1.011081 0.707247 -0.418881 -0.122282 \n", + "79 -0.371343 1.376703 0.122392 1.052516 0.088358 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "102 0.424438 
0.081051 -0.176095 -0.492158 \n", + "84 -0.152402 0.181447 -1.143031 1.330009 \n", + "96 -1.113800 -1.043392 -1.787656 -0.054137 \n", + "65 0.200111 0.623193 0.065639 0.856946 \n", + "79 0.857067 0.522796 0.549108 0.629175 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "102 -0.976782 -0.690784 1.085200 -0.983669 \n", + "84 -0.868639 -0.734657 0.661468 -0.722540 \n", + "96 -1.106553 -0.032683 -0.496736 -0.388168 \n", + "65 -0.198156 1.020278 -0.440238 -0.219390 \n", + "79 -1.076273 1.020278 0.732090 -0.904056 \n", + "\n", + "y_test (first 5 rows):\n", + "102 1\n", + "84 1\n", + "96 1\n", + "65 1\n", + "79 1\n", + "Name: class, dtype: int64\n" + ] + } + ], "source": [ "# set a seed for reproducibility\n", "np.random.seed(123)\n", "\n", - "# split the data into a training and testing set. hint: use train_test_split !\n", + "# # split the data into a training and testing set. hint: use train_test_split !\n", + "\n", + "# Features (already standardized)\n", + "X = predictors_standardized\n", + "\n", + "# Target/Response variable\n", + "y = wine_df[\"class\"]\n", + "\n", + "# Split into train/test (75/25), preserving class balance\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " train_size=0.75,\n", + " stratify=y\n", + ")\n", + "\n", + "# Display the shapes of the resulting datasets\n", + "print(\"X_train shape:\", X_train.shape)\n", + "print(\"X_test shape:\", X_test.shape)\n", + "print(\"y_train shape:\", y_train.shape)\n", + "print(\"y_test shape:\", y_test.shape)\n", + "\n", + "\n", + "\n", + "# Display training predictors\n", + "print(\"X_train (first 5 rows):\")\n", + "print(X_train.head(), \"\\n\")\n", + "\n", + "# Display training target\n", + "print(\"y_train (first 5 rows):\")\n", + "print(y_train.head(), \"\\n\")\n", "\n", - "# Your code here ..." 
+ "# Display testing predictors\n", + "print(\"X_test (first 5 rows):\")\n", + "print(X_test.head(), \"\\n\")\n", + "\n", + "# Display testing target\n", + "print(\"y_test (first 5 rows):\")\n", + "print(y_test.head())\n" ] }, { @@ -284,12 +919,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 100, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimal k value: 7\n", + "Accuracy: 0.9774725274725276\n" + ] + } + ], "source": [ - "# Your code here..." + "# Step 1. Initialize the KNN model\n", + "knn = KNeighborsClassifier()\n", + "\n", + "# Step 2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n", + "parameter_grid = {\n", + " \"n_neighbors\" : range(1,51) \n", + "} \n", + "\n", + "# Step 3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n", + "grid_search = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10,\n", + " scoring='accuracy'\n", + ") \n", + "\n", + "# Step 4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results.\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "# Step 5. 
Evaluate the model's performance on the test set using the best found hyperparameter and report the accuracy.\n", + "print(\"Optimal k value:\", grid_search.best_params_[\"n_neighbors\"])\n", + "print(\"Accuracy:\", grid_search.best_score_)" ] }, { @@ -308,9 +973,193 @@ "execution_count": null, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test set accuracy: 0.9333333333333333\n" + ] + } + ], + "source": [ + "# Retrieve the optimal k found by the grid search\n", + "optimal_k = grid_search.best_params_[\"n_neighbors\"]\n", + "\n", + "# Initialize KNN with the optimal k\n", + "knn_optimal = KNeighborsClassifier(n_neighbors=optimal_k)\n", + "knn_optimal.fit(X_train, y_train)\n", + "y_pred = knn_optimal.predict(X_test)\n", + "\n", + "print(\"Test set accuracy:\", accuracy_score(y_test, y_pred))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "cc6c5a90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Confusion Matrix:\n", + "\n", + "Predicted 0 1 2\n", + "Actual \n", + "0 15 0 0\n", + "1 2 15 1\n", + "2 0 0 12\n" + ] + } + ], + "source": [ + "# Confusion Matrix using pandas crosstab\n", + "conf_matrix = pd.crosstab(\n", + " y_test, y_pred,\n", + " rownames=['Actual'],\n", + " colnames=['Predicted'],\n", + " dropna=False\n", + ")\n", + "print(\"\\nConfusion Matrix:\\n\")\n", + "print(conf_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "20511d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Precision Recall\n", + "class_0 0.882 1.000\n", + "class_1 1.000 0.833\n", + "class_2 0.923 1.000\n" + ] + } + ], "source": [ - "# Your code here..." 
+ "# Precision, Recall, F1 for each class\n", + "precision = precision_score(y_test, y_pred, average=None)\n", + "recall = recall_score(y_test, y_pred, average=None)\n", + "\n", + "# Put results into a DataFrame for readability\n", + "metrics_df = pd.DataFrame({\n", + " \"Precision\": precision.round(3),\n", + " \"Recall\": recall.round(3)\n", + "}, index=wine_data.target_names)\n", + "\n", + "print(metrics_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "f0bd353a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assignment 1: Wine Dataset KNN Classification\n", + "\n", + "Question 1: Dataset Summary\n", + "Total observations (rows): 178\n", + "Total variables (columns): 14\n", + "Total features: 13\n", + "Target classes: ['class_0', 'class_1', 'class_2']\n", + "Variable type: int64\n", + "Unique value: 3\n", + "Value(s): [0 1 2]\n", + "\n", + "Question 2: Standardization and data-splitting\n", + "Train/Test Split\n", + "--------------------------------------------------\n", + "Training set size: 133 samples\n", + "Test set size: 45 samples\n", + "\n", + "Question 3: Cross-Validation & Hyperparameter Tuning\n", + "Hyperparameter Tuning (10-fold Cross-Validation)\n", + "--------------------------------------------------\n", + "Optimal number of neighbors (k): 7\n", + "Mean CV Accuracy: 0.977\n", + "\n", + "Question 4: Model Evaluation\n", + "Performance on Test Set\n", + "--------------------------------------------------\n", + "Test Accuracy: 0.933\n", + "\n", + "Confusion Matrix\n", + "--------------------------------------------------\n", + "Predicted 0 1 2\n", + "Actual \n", + "0 15 0 0\n", + "1 2 15 1\n", + "2 0 0 12 \n", + "\n", + "Per-Class Metrics\n", + "--------------------------------------------------\n", + " Precision Recall\n", + "class_0 0.882 1.000\n", + "class_1 1.000 0.833\n", + "class_2 0.923 1.000\n", + "==================================================\n" + ] + } + ], 
+ "source": [ + "# Final report\n", + "print(\"Assignment 1: Wine Dataset KNN Classification\\n\")\n", + "\n", + "# Dataset summary\n", + "print(\"Question 1: Dataset Summary\")\n", + "print(f\"Total observations (rows): {wine_df.shape[0]}\")\n", + "print(f\"Total variables (columns): {wine_df.shape[1]}\")\n", + "print(f\"Total features: {wine_df.shape[1] - 1}\")\n", + "print(f\"Target classes: {list(wine_data.target_names)}\")\n", + "print(f\"Variable type: {wine_df['class'].dtype}\") \n", + "print(f\"Unique value: {wine_df['class'].nunique()}\") \n", + "print(f\"Value(s): {wine_df['class'].unique()}\\n\") \n", + "\n", + "print(\"Question 2: Standardization and data-splitting\")\n", + "print(\"Train/Test Split\")\n", + "print(\"-\"*50)\n", + "print(f\"Training set size: {X_train.shape[0]} samples\")\n", + "print(f\"Test set size: {X_test.shape[0]} samples\\n\")\n", + "\n", + "print(\"Question 3: Cross-Validation & Hyperparameter Tuning\")\n", + "# Hyperparameter tuning results\n", + "optimal_k = grid_search.best_params_[\"n_neighbors\"]\n", + "print(\"Hyperparameter Tuning (10-fold Cross-Validation)\")\n", + "print(\"-\"*50)\n", + "print(f\"Optimal number of neighbors (k): {optimal_k}\")\n", + "print(f\"Mean CV Accuracy: {grid_search.best_score_:.3f}\\n\")\n", + "\n", + "\n", + "# Test performance\n", + "print(\"Question 4: Model Evaluation\")\n", + "test_accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Performance on Test Set\")\n", + "print(\"-\"*50)\n", + "print(f\"Test Accuracy: {test_accuracy:.3f}\\n\")\n", + "\n", + "# Confusion Matrix\n", + "print(\"Confusion Matrix\")\n", + "print(\"-\"*50)\n", + "print(conf_matrix, \"\\n\")\n", + "\n", + "# Precision & Recall\n", + "print(\"Per-Class Metrics\")\n", + "print(\"-\"*50)\n", + "print(metrics_df)\n", + "print(\"=\"*50)\n" ] }, { @@ -354,10 +1203,10 @@ " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. 
Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n", "\n", "Checklist:\n", - "- [ ] Created a branch with the correct naming convention.\n", - "- [ ] Ensured that the repository is public.\n", - "- [ ] Reviewed the PR description guidelines and adhered to them.\n", - "- [ ] Verify that the link is accessible in a private browser window.\n", + "- [X] Created a branch with the correct naming convention.\n", + "- [X] Ensured that the repository is public.\n", + "- [X] Reviewed the PR description guidelines and adhered to them.\n", + "- [X] Verify that the link is accessible in a private browser window.\n", "\n", "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-7-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n" ] @@ -365,7 +1214,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -379,12 +1228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.12.3" } }, "nbformat": 4, diff --git a/04_this_cohort/live_code/live_code_02_27_2025.ipynb b/04_this_cohort/live_code/live_code_02_27_2025.ipynb index f5cc4479b..df4a6d64f 100644 --- a/04_this_cohort/live_code/live_code_02_27_2025.ipynb +++ b/04_this_cohort/live_code/live_code_02_27_2025.ipynb @@ -497,6 +497,13 @@ "cancer['diagnosis'].unique()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 9, @@ -511,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": 
{}, "outputs": [ { diff --git a/04_this_cohort/live_code/live_code_09-02-25.ipynb b/04_this_cohort/live_code/live_code_09-02-25.ipynb new file mode 100644 index 000000000..9bc3cbb16 --- /dev/null +++ b/04_this_cohort/live_code/live_code_09-02-25.ipynb @@ -0,0 +1,3059 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "c25b5ba9", + "metadata": {}, + "outputs": [], + "source": [ + "#import our libraries\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from mpl_toolkits import mplot3d\n", + "import os\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8ed98cfe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302M17.9910.38122.801001.00.118400.277600.300100.14710...25.38017.33184.602019.00.162200.665600.71190.26540.46010.11890
1842517M20.5717.77132.901326.00.084740.078640.086900.07017...24.99023.41158.801956.00.123800.186600.24160.18600.27500.08902
284300903M19.6921.25130.001203.00.109600.159900.197400.12790...23.57025.53152.501709.00.144400.424500.45040.24300.36130.08758
384348301M11.4220.3877.58386.10.142500.283900.241400.10520...14.91026.5098.87567.70.209800.866300.68690.25750.66380.17300
484358402M20.2914.34135.101297.00.100300.132800.198000.10430...22.54016.67152.201575.00.137400.205000.40000.16250.23640.07678
..................................................................
564926424M21.5622.39142.001479.00.111000.115900.243900.13890...25.45026.40166.102027.00.141000.211300.41070.22160.20600.07115
565926682M20.1328.25131.201261.00.097800.103400.144000.09791...23.69038.25155.001731.00.116600.192200.32150.16280.25720.06637
566926954M16.6028.08108.30858.10.084550.102300.092510.05302...18.98034.12126.701124.00.113900.309400.34030.14180.22180.07820
567927241M20.6029.33140.101265.00.117800.277000.351400.15200...25.74039.42184.601821.00.165000.868100.93870.26500.40870.12400
56892751B7.7624.5447.92181.00.052630.043620.000000.00000...9.45630.3759.16268.60.089960.064440.00000.00000.28710.07039
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 \n", + ".. ... ... ... ... ... ... \n", + "564 926424 M 21.56 22.39 142.00 1479.0 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 \n", + "566 926954 M 16.60 28.08 108.30 858.1 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 \n", + "568 92751 B 7.76 24.54 47.92 181.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.30010 0.14710 \n", + "1 0.08474 0.07864 0.08690 0.07017 \n", + "2 0.10960 0.15990 0.19740 0.12790 \n", + "3 0.14250 0.28390 0.24140 0.10520 \n", + "4 0.10030 0.13280 0.19800 0.10430 \n", + ".. ... ... ... ... \n", + "564 0.11100 0.11590 0.24390 0.13890 \n", + "565 0.09780 0.10340 0.14400 0.09791 \n", + "566 0.08455 0.10230 0.09251 0.05302 \n", + "567 0.11780 0.27700 0.35140 0.15200 \n", + "568 0.05263 0.04362 0.00000 0.00000 \n", + "\n", + " ... radius_worst texture_worst perimeter_worst area_worst \\\n", + "0 ... 25.380 17.33 184.60 2019.0 \n", + "1 ... 24.990 23.41 158.80 1956.0 \n", + "2 ... 23.570 25.53 152.50 1709.0 \n", + "3 ... 14.910 26.50 98.87 567.7 \n", + "4 ... 22.540 16.67 152.20 1575.0 \n", + ".. ... ... ... ... ... \n", + "564 ... 25.450 26.40 166.10 2027.0 \n", + "565 ... 23.690 38.25 155.00 1731.0 \n", + "566 ... 18.980 34.12 126.70 1124.0 \n", + "567 ... 25.740 39.42 184.60 1821.0 \n", + "568 ... 9.456 30.37 59.16 268.6 \n", + "\n", + " smoothness_worst compactness_worst concavity_worst \\\n", + "0 0.16220 0.66560 0.7119 \n", + "1 0.12380 0.18660 0.2416 \n", + "2 0.14440 0.42450 0.4504 \n", + "3 0.20980 0.86630 0.6869 \n", + "4 0.13740 0.20500 0.4000 \n", + ".. ... ... ... 
\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer = pd.read_csv('/Users/vincent/dsi_lcr/LCR/01_materials/notebooks/dataset/wdbc.csv')\n", + "cancer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "85d93b5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 569 entries, 0 to 568\n", + "Data columns (total 32 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 569 non-null int64 \n", + " 1 diagnosis 569 non-null object \n", + " 2 radius_mean 569 non-null float64\n", + " 3 texture_mean 569 non-null float64\n", + " 4 perimeter_mean 569 non-null float64\n", + " 5 area_mean 569 non-null float64\n", + " 6 smoothness_mean 569 non-null float64\n", + " 7 compactness_mean 569 non-null float64\n", + " 8 concavity_mean 569 non-null float64\n", + " 9 concave points_mean 569 non-null float64\n", + " 10 symmetry_mean 569 non-null float64\n", + " 11 fractal_dimension_mean 569 non-null float64\n", + " 12 radius_se 569 non-null float64\n", + " 13 texture_se 569 non-null float64\n", + " 14 perimeter_se 569 non-null float64\n", + " 15 area_se 569 non-null float64\n", + " 16 smoothness_se 569 non-null float64\n", + " 17 compactness_se 569 
non-null float64\n", + " 18 concavity_se 569 non-null float64\n", + " 19 concave points_se 569 non-null float64\n", + " 20 symmetry_se 569 non-null float64\n", + " 21 fractal_dimension_se 569 non-null float64\n", + " 22 radius_worst 569 non-null float64\n", + " 23 texture_worst 569 non-null float64\n", + " 24 perimeter_worst 569 non-null float64\n", + " 25 area_worst 569 non-null float64\n", + " 26 smoothness_worst 569 non-null float64\n", + " 27 compactness_worst 569 non-null float64\n", + " 28 concavity_worst 569 non-null float64\n", + " 29 concave points_worst 569 non-null float64\n", + " 30 symmetry_worst 569 non-null float64\n", + " 31 fractal_dimension_worst 569 non-null float64\n", + "dtypes: float64(30), int64(1), object(1)\n", + "memory usage: 142.4+ KB\n" + ] + } + ], + "source": [ + "#Look at our data using .info()\n", + "cancer.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0e0d8c47", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n", + "['M' 'B']\n" + ] + } + ], + "source": [ + "#How many unique values are in the diagnosis column?\n", + "print(cancer['diagnosis'].nunique())\n", + "\n", + "#Output should be array(['M', 'B'], dtype=object)\n", + "print(cancer['diagnosis'].unique())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1cbf07dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Malignant', 'Benign'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Replace M with Malignant, B with Benign\n", + "cancer['diagnosis'] = cancer['diagnosis'].replace({'M': 'Malignant', 'B': 'Benign'})\n", + "cancer['diagnosis'].unique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c87d6c1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "diagnosis\n", + "Benign 357\n", + "Malignant 212\n", + 
"Name: count, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer['diagnosis'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f8946340", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "diagnosis\n", + "Benign 0.627417\n", + "Malignant 0.372583\n", + "Name: proportion, dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer['diagnosis'].value_counts(normalize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e199dc29", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Now, let’s create a scatter plot to visualize the relationship between perimeter mean and concavity mean. This will help us see how these features relate to whether a tumor is benign or malignant.\n", + "\n", + "# Create mapping between values and colors\n", + "labels = cancer[\"diagnosis\"].unique().tolist() #['Malignant', 'Benign']\n", + "colors = list(mcolors.TABLEAU_COLORS.keys()) #['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']\n", + "color_map = {l: colors[i % len(colors)] for i, l in enumerate(labels)} #{'Malignant': 'tab:blue', 'Benign': 'tab:orange'}\n", + "\n", + "# Plot\n", + "# Scatter plot of perimeter_mean vs concavity_mean, colored by diagnosis\n", + "plt.scatter(cancer[\"perimeter_mean\"], cancer['concavity_mean'], \n", + " color=cancer[\"diagnosis\"].map(color_map))\n", + "\n", + "# Create custom legend handles\n", + "# Plot legend\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Perimeter Mean')\n", + "plt.ylabel('Concavity Mean')\n", + "plt.title('Scatter Plot of Perimeter Mean vs Concavity Mean')\n", + "plt.legend(handles=handles, title='Diagnosis')\n", + "plt.show()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e7f88d77", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot existing data\n", + "plt.scatter(cancer[\"perimeter_mean\"], cancer['concavity_mean'], \n", + " color=cancer[\"diagnosis\"].map(color_map))\n", + "\n", + "# Create custom legend handles\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add new observation\n", + "new_observation = {'perimeter_mean': 97, 'concavity_mean': 0.20}\n", + "plt.scatter(new_observation['perimeter_mean'], new_observation['concavity_mean'],\n", + " color='red', edgecolor='black', s=100, label='New Observation')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Perimeter Mean')\n", + "plt.ylabel('Concavity Mean')\n", + "plt.title('Scatter Plot of Perimeter Mean vs Concavity Mean')\n", + "plt.legend(handles=handles + [plt.Line2D([0], [0], marker='o', color='w', \n", + " markerfacecolor='red', markeredgecolor='black', \n", + " markersize=10, label='New Observation')], \n", + " title='Diagnosis')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3cb834c9", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "new_obs_Perimeter = 97\n", + "new_obs_Concavity = 0.20\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d12c9452", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "cancer[\"dist_from_new\"] = ((cancer['perimeter_mean'] - new_obs_Perimeter)**2 + \n", + "(cancer['concavity_mean'] - new_obs_Concavity)**2) ** (1/2)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6bc21a95", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...texture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstdist_from_new
0842302Malignant17.9910.38122.801001.00.118400.277600.300100.14710...17.33184.602019.00.162200.665600.71190.26540.46010.1189025.800194
1842517Malignant20.5717.77132.901326.00.084740.078640.086900.07017...23.41158.801956.00.123800.186600.24160.18600.27500.0890235.900178
284300903Malignant19.6921.25130.001203.00.109600.159900.197400.12790...25.53152.501709.00.144400.424500.45040.24300.36130.0875833.000000
384348301Malignant11.4220.3877.58386.10.142500.283900.241400.10520...26.5098.87567.70.209800.866300.68690.25750.66380.1730019.420044
484358402Malignant20.2914.34135.101297.00.100300.132800.198000.10430...16.67152.201575.00.137400.205000.40000.16250.23640.0767838.100000
..................................................................
564926424Malignant21.5622.39142.001479.00.111000.115900.243900.13890...26.40166.102027.00.141000.211300.41070.22160.20600.0711545.000021
565926682Malignant20.1328.25131.201261.00.097800.103400.144000.09791...38.25155.001731.00.116600.192200.32150.16280.25720.0663734.200046
566926954Malignant16.6028.08108.30858.10.084550.102300.092510.05302...34.12126.701124.00.113900.309400.34030.14180.22180.0782011.300511
567927241Malignant20.6029.33140.101265.00.117800.277000.351400.15200...39.42184.601821.00.165000.868100.93870.26500.40870.1240043.100266
56892751Benign7.7624.5447.92181.00.052630.043620.000000.00000...30.3759.16268.60.089960.064440.00000.00000.28710.0703949.080407
\n", + "

569 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean \\\n", + "0 842302 Malignant 17.99 10.38 122.80 \n", + "1 842517 Malignant 20.57 17.77 132.90 \n", + "2 84300903 Malignant 19.69 21.25 130.00 \n", + "3 84348301 Malignant 11.42 20.38 77.58 \n", + "4 84358402 Malignant 20.29 14.34 135.10 \n", + ".. ... ... ... ... ... \n", + "564 926424 Malignant 21.56 22.39 142.00 \n", + "565 926682 Malignant 20.13 28.25 131.20 \n", + "566 926954 Malignant 16.60 28.08 108.30 \n", + "567 927241 Malignant 20.60 29.33 140.10 \n", + "568 92751 Benign 7.76 24.54 47.92 \n", + "\n", + " area_mean smoothness_mean compactness_mean concavity_mean \\\n", + "0 1001.0 0.11840 0.27760 0.30010 \n", + "1 1326.0 0.08474 0.07864 0.08690 \n", + "2 1203.0 0.10960 0.15990 0.19740 \n", + "3 386.1 0.14250 0.28390 0.24140 \n", + "4 1297.0 0.10030 0.13280 0.19800 \n", + ".. ... ... ... ... \n", + "564 1479.0 0.11100 0.11590 0.24390 \n", + "565 1261.0 0.09780 0.10340 0.14400 \n", + "566 858.1 0.08455 0.10230 0.09251 \n", + "567 1265.0 0.11780 0.27700 0.35140 \n", + "568 181.0 0.05263 0.04362 0.00000 \n", + "\n", + " concave points_mean ... texture_worst perimeter_worst area_worst \\\n", + "0 0.14710 ... 17.33 184.60 2019.0 \n", + "1 0.07017 ... 23.41 158.80 1956.0 \n", + "2 0.12790 ... 25.53 152.50 1709.0 \n", + "3 0.10520 ... 26.50 98.87 567.7 \n", + "4 0.10430 ... 16.67 152.20 1575.0 \n", + ".. ... ... ... ... ... \n", + "564 0.13890 ... 26.40 166.10 2027.0 \n", + "565 0.09791 ... 38.25 155.00 1731.0 \n", + "566 0.05302 ... 34.12 126.70 1124.0 \n", + "567 0.15200 ... 39.42 184.60 1821.0 \n", + "568 0.00000 ... 30.37 59.16 268.6 \n", + "\n", + " smoothness_worst compactness_worst concavity_worst \\\n", + "0 0.16220 0.66560 0.7119 \n", + "1 0.12380 0.18660 0.2416 \n", + "2 0.14440 0.42450 0.4504 \n", + "3 0.20980 0.86630 0.6869 \n", + "4 0.13740 0.20500 0.4000 \n", + ".. ... ... ... 
\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \\\n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + " dist_from_new \n", + "0 25.800194 \n", + "1 35.900178 \n", + "2 33.000000 \n", + "3 19.420044 \n", + "4 38.100000 \n", + ".. ... \n", + "564 45.000021 \n", + "565 34.200046 \n", + "566 11.300511 \n", + "567 43.100266 \n", + "568 49.080407 \n", + "\n", + "[569 rows x 33 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bcac4cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perimeter_meanconcavity_meandiagnosisdist_from_new
29197.030.05940Benign0.143765
13896.850.15390Malignant0.156924
1596.730.16390Malignant0.272403
51497.260.07486Malignant0.288548
5497.260.05253Malignant0.298910
\n", + "
" + ], + "text/plain": [ + " perimeter_mean concavity_mean diagnosis dist_from_new\n", + "291 97.03 0.05940 Benign 0.143765\n", + "138 96.85 0.15390 Malignant 0.156924\n", + "15 96.73 0.16390 Malignant 0.272403\n", + "514 97.26 0.07486 Malignant 0.288548\n", + "54 97.26 0.05253 Malignant 0.298910" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Find the 5 closest points to our new observation, and look at the perimter mean, concavity mean, diagnosis, distance from new\n", + "cancer.nsmallest(5, 'dist_from_new')[['perimeter_mean', 'concavity_mean', 'diagnosis', 'dist_from_new']] \n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1cd11e6e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Incorporate more features to improve accuracy\n", + "#Include symmetry_mean as a third dimension\n", + "\n", + "# Create mapping between values and colors\n", + "labels = cancer[\"diagnosis\"].unique().tolist()\n", + "colors = list(mcolors.TABLEAU_COLORS.keys())\n", + "color_map = {l: colors[i % len(colors)] for i, l in enumerate(labels)}\n", + "\n", + "# Create a 3D plot\n", + "ax = plt.axes(projection=\"3d\")\n", + "\n", + "# Plot data points with color corresponding to diagnosis\n", + "sc = ax.scatter3D(cancer['perimeter_mean'], cancer['concavity_mean'], cancer['symmetry_mean'], \n", + " c=cancer['diagnosis'].map(color_map), marker='o')\n", + "\n", + "# Define the new observation\n", + "new_observation = {'perimeter_mean': 97, 'concavity_mean': 0.20, 'symmetry_mean': 0.22}\n", + "\n", + "# Plot the new observation\n", + "ax.scatter3D(new_observation['perimeter_mean'], new_observation['concavity_mean'], \n", + " new_observation['symmetry_mean'], color='red', edgecolor='black', \n", + " s=100, marker='o', label='New Observation')\n", + "\n", + "# Add axis labels\n", + "ax.set_xlabel('Perimeter Mean')\n", + "ax.set_ylabel('Concavity Mean')\n", + "ax.set_zlabel('Symmetry Mean')\n", + "ax.set_title('3D Scatter Plot of Perimeter Mean, Concavity Mean, and Symmetry Mean')\n", + "\n", + "# Create custom legend handles\n", + "handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,\n", + " markersize=10, markerfacecolor=color_map[label])\n", + " for label in labels]\n", + "\n", + "# Add custom legend for new observation\n", + "handles.append(plt.Line2D([0], [0], marker='o', color='red', label='New Observation', \n", + " markersize=10, markeredgecolor='black'))\n", + "\n", + "# Add legend\n", + "plt.legend(handles=handles, title='Diagnosis')\n", + "\n", + "# Show plot\n", + "plt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "900d79d7", + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perimeter_meanconcavity_meansymmetry_meandiagnosisdist_from_new
29197.030.059400.1879Benign0.147305
13896.850.153900.1957Malignant0.158795
1596.730.163900.2303Malignant0.272597
51497.260.074860.1561Malignant0.295539
5497.260.052530.1616Malignant0.304562
\n", + "
" + ], + "text/plain": [ + " perimeter_mean concavity_mean symmetry_mean diagnosis dist_from_new\n", + "291 97.03 0.05940 0.1879 Benign 0.147305\n", + "138 96.85 0.15390 0.1957 Malignant 0.158795\n", + "15 96.73 0.16390 0.2303 Malignant 0.272597\n", + "514 97.26 0.07486 0.1561 Malignant 0.295539\n", + "54 97.26 0.05253 0.1616 Malignant 0.304562" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#New observation of perimeter_mean = 97, concavity_mean = 0.20, symmetry_mean = 0.22\n", + "\n", + "new_obs_Perimeter = 97\n", + "new_obs_Concavity = 0.20\n", + "new_obs_Symmetry = 0.22\n", + "\n", + "cancer[\"dist_from_new\"] = ((cancer['perimeter_mean'] - new_obs_Perimeter)**2 + \n", + "(cancer['concavity_mean'] - new_obs_Concavity)**2 +\n", + "(cancer['symmetry_mean'] - new_obs_Symmetry)**2) ** (1/2)\n", + "\n", + "#Find the 5 closest points to our new observation, and look at the perimter mean, concavity mean, diagnosis, distance from new\n", + "cancer.nsmallest(5, 'dist_from_new')[['perimeter_mean', 'concavity_mean', 'symmetry_mean', 'diagnosis', 'dist_from_new']]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "eee634be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 25.800203\n", + "1 35.900199\n", + "2 33.000003\n", + "3 19.420085\n", + "4 38.100020\n", + " ... 
\n", + "564 45.000046\n", + "565 34.200075\n", + "566 11.300676\n", + "567 43.100270\n", + "568 49.080446\n", + "Name: dist_from_new, Length: 569, dtype: float64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer['dist_from_new']" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "7bc1c8f0", + "metadata": {}, + "outputs": [], + "source": [ + "nearest_5 = cancer.nsmallest(5, \"dist_from_new\")[[\n", + " \"perimeter_mean\",\n", + " \"concavity_mean\",\n", + " \"symmetry_mean\",\n", + " \"diagnosis\",\n", + " \"dist_from_new\"\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "36637a59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perimeter_meanconcavity_meansymmetry_meandiagnosisdist_from_new
29197.030.059400.1879Benign0.147305
13896.850.153900.1957Malignant0.158795
1596.730.163900.2303Malignant0.272597
51497.260.074860.1561Malignant0.295539
5497.260.052530.1616Malignant0.304562
\n", + "
" + ], + "text/plain": [ + " perimeter_mean concavity_mean symmetry_mean diagnosis dist_from_new\n", + "291 97.03 0.05940 0.1879 Benign 0.147305\n", + "138 96.85 0.15390 0.1957 Malignant 0.158795\n", + "15 96.73 0.16390 0.2303 Malignant 0.272597\n", + "514 97.26 0.07486 0.1561 Malignant 0.295539\n", + "54 97.26 0.05253 0.1616 Malignant 0.304562" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nearest_5" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "39076891", + "metadata": {}, + "outputs": [], + "source": [ + "#import sklearn's KNeighborsClassifier\n", + "from sklearn import set_config\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "baa0f1e0", + "metadata": {}, + "outputs": [], + "source": [ + "#Output dataframes instead of arrays\n", + "set_config(transform_output=\"pandas\") # Other option is 'text'" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "a2d7cdb4", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "b320e010", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
diagnosisperimeter_meanconcavity_mean
0Malignant122.800.30010
1Malignant132.900.08690
2Malignant130.000.19740
3Malignant77.580.24140
4Malignant135.100.19800
............
564Malignant142.000.24390
565Malignant131.200.14400
566Malignant108.300.09251
567Malignant140.100.35140
568Benign47.920.00000
\n", + "

569 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " diagnosis perimeter_mean concavity_mean\n", + "0 Malignant 122.80 0.30010\n", + "1 Malignant 132.90 0.08690\n", + "2 Malignant 130.00 0.19740\n", + "3 Malignant 77.58 0.24140\n", + "4 Malignant 135.10 0.19800\n", + ".. ... ... ...\n", + "564 Malignant 142.00 0.24390\n", + "565 Malignant 131.20 0.14400\n", + "566 Malignant 108.30 0.09251\n", + "567 Malignant 140.10 0.35140\n", + "568 Benign 47.92 0.00000\n", + "\n", + "[569 rows x 3 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer_train = cancer[[\"diagnosis\", \"perimeter_mean\", \"concavity_mean\"]]\n", + "cancer_train\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "1c31219d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "knn" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "fbf5c0c2", + "metadata": {}, + "outputs": [], + "source": [ + "#define predictors and response variable\n", + "X = cancer_train[[\"perimeter_mean\", \"concavity_mean\"]]\n", + "y = cancer_train[\"diagnosis\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18bd9018", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn.fit(X, y) #We are fitting the model to our data" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "e86da175", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perimeter_meanconcavity_mean
0970.2
\n", + "
" + ], + "text/plain": [ + " perimeter_mean concavity_mean\n", + "0 97 0.2" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_obs = pd.DataFrame({\"perimeter_mean\" :[97],\n", + " \"concavity_mean\": [0.20]})\n", + "new_obs" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "97aad866", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Malignant'], dtype=object)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Predict the diagnosis for our new observation\n", + "knn.predict(new_obs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/04_this_cohort/live_code/live_code_09-03-25.ipynb b/04_this_cohort/live_code/live_code_09-03-25.ipynb new file mode 100644 index 000000000..a3c525047 --- /dev/null +++ b/04_this_cohort/live_code/live_code_09-03-25.ipynb @@ -0,0 +1,6845 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8b7a39f4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": 
"bf78297c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302M17.9910.38122.801001.00.118400.277600.300100.14710...25.38017.33184.602019.00.162200.665600.71190.26540.46010.11890
1842517M20.5717.77132.901326.00.084740.078640.086900.07017...24.99023.41158.801956.00.123800.186600.24160.18600.27500.08902
284300903M19.6921.25130.001203.00.109600.159900.197400.12790...23.57025.53152.501709.00.144400.424500.45040.24300.36130.08758
384348301M11.4220.3877.58386.10.142500.283900.241400.10520...14.91026.5098.87567.70.209800.866300.68690.25750.66380.17300
484358402M20.2914.34135.101297.00.100300.132800.198000.10430...22.54016.67152.201575.00.137400.205000.40000.16250.23640.07678
..................................................................
564926424M21.5622.39142.001479.00.111000.115900.243900.13890...25.45026.40166.102027.00.141000.211300.41070.22160.20600.07115
565926682M20.1328.25131.201261.00.097800.103400.144000.09791...23.69038.25155.001731.00.116600.192200.32150.16280.25720.06637
566926954M16.6028.08108.30858.10.084550.102300.092510.05302...18.98034.12126.701124.00.113900.309400.34030.14180.22180.07820
567927241M20.6029.33140.101265.00.117800.277000.351400.15200...25.74039.42184.601821.00.165000.868100.93870.26500.40870.12400
56892751B7.7624.5447.92181.00.052630.043620.000000.00000...9.45630.3759.16268.60.089960.064440.00000.00000.28710.07039
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 \n", + ".. ... ... ... ... ... ... \n", + "564 926424 M 21.56 22.39 142.00 1479.0 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 \n", + "566 926954 M 16.60 28.08 108.30 858.1 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 \n", + "568 92751 B 7.76 24.54 47.92 181.0 \n", + "\n", + " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", + "0 0.11840 0.27760 0.30010 0.14710 \n", + "1 0.08474 0.07864 0.08690 0.07017 \n", + "2 0.10960 0.15990 0.19740 0.12790 \n", + "3 0.14250 0.28390 0.24140 0.10520 \n", + "4 0.10030 0.13280 0.19800 0.10430 \n", + ".. ... ... ... ... \n", + "564 0.11100 0.11590 0.24390 0.13890 \n", + "565 0.09780 0.10340 0.14400 0.09791 \n", + "566 0.08455 0.10230 0.09251 0.05302 \n", + "567 0.11780 0.27700 0.35140 0.15200 \n", + "568 0.05263 0.04362 0.00000 0.00000 \n", + "\n", + " ... radius_worst texture_worst perimeter_worst area_worst \\\n", + "0 ... 25.380 17.33 184.60 2019.0 \n", + "1 ... 24.990 23.41 158.80 1956.0 \n", + "2 ... 23.570 25.53 152.50 1709.0 \n", + "3 ... 14.910 26.50 98.87 567.7 \n", + "4 ... 22.540 16.67 152.20 1575.0 \n", + ".. ... ... ... ... ... \n", + "564 ... 25.450 26.40 166.10 2027.0 \n", + "565 ... 23.690 38.25 155.00 1731.0 \n", + "566 ... 18.980 34.12 126.70 1124.0 \n", + "567 ... 25.740 39.42 184.60 1821.0 \n", + "568 ... 9.456 30.37 59.16 268.6 \n", + "\n", + " smoothness_worst compactness_worst concavity_worst \\\n", + "0 0.16220 0.66560 0.7119 \n", + "1 0.12380 0.18660 0.2416 \n", + "2 0.14440 0.42450 0.4504 \n", + "3 0.20980 0.86630 0.6869 \n", + "4 0.13740 0.20500 0.4000 \n", + ".. ... ... ... 
\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer = pd.read_csv('/Users/vincent/dsi_lcr/LCR/01_materials/notebooks/dataset/wdbc.csv')\n", + "cancer" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1a13f212", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302Malignant17.9910.38122.801001.00.118400.277600.300100.14710...25.38017.33184.602019.00.162200.665600.71190.26540.46010.11890
1842517Malignant20.5717.77132.901326.00.084740.078640.086900.07017...24.99023.41158.801956.00.123800.186600.24160.18600.27500.08902
284300903Malignant19.6921.25130.001203.00.109600.159900.197400.12790...23.57025.53152.501709.00.144400.424500.45040.24300.36130.08758
384348301Malignant11.4220.3877.58386.10.142500.283900.241400.10520...14.91026.5098.87567.70.209800.866300.68690.25750.66380.17300
484358402Malignant20.2914.34135.101297.00.100300.132800.198000.10430...22.54016.67152.201575.00.137400.205000.40000.16250.23640.07678
..................................................................
564926424Malignant21.5622.39142.001479.00.111000.115900.243900.13890...25.45026.40166.102027.00.141000.211300.41070.22160.20600.07115
565926682Malignant20.1328.25131.201261.00.097800.103400.144000.09791...23.69038.25155.001731.00.116600.192200.32150.16280.25720.06637
566926954Malignant16.6028.08108.30858.10.084550.102300.092510.05302...18.98034.12126.701124.00.113900.309400.34030.14180.22180.07820
567927241Malignant20.6029.33140.101265.00.117800.277000.351400.15200...25.74039.42184.601821.00.165000.868100.93870.26500.40870.12400
56892751Benign7.7624.5447.92181.00.052630.043620.000000.00000...9.45630.3759.16268.60.089960.064440.00000.00000.28710.07039
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean \\\n", + "0 842302 Malignant 17.99 10.38 122.80 \n", + "1 842517 Malignant 20.57 17.77 132.90 \n", + "2 84300903 Malignant 19.69 21.25 130.00 \n", + "3 84348301 Malignant 11.42 20.38 77.58 \n", + "4 84358402 Malignant 20.29 14.34 135.10 \n", + ".. ... ... ... ... ... \n", + "564 926424 Malignant 21.56 22.39 142.00 \n", + "565 926682 Malignant 20.13 28.25 131.20 \n", + "566 926954 Malignant 16.60 28.08 108.30 \n", + "567 927241 Malignant 20.60 29.33 140.10 \n", + "568 92751 Benign 7.76 24.54 47.92 \n", + "\n", + " area_mean smoothness_mean compactness_mean concavity_mean \\\n", + "0 1001.0 0.11840 0.27760 0.30010 \n", + "1 1326.0 0.08474 0.07864 0.08690 \n", + "2 1203.0 0.10960 0.15990 0.19740 \n", + "3 386.1 0.14250 0.28390 0.24140 \n", + "4 1297.0 0.10030 0.13280 0.19800 \n", + ".. ... ... ... ... \n", + "564 1479.0 0.11100 0.11590 0.24390 \n", + "565 1261.0 0.09780 0.10340 0.14400 \n", + "566 858.1 0.08455 0.10230 0.09251 \n", + "567 1265.0 0.11780 0.27700 0.35140 \n", + "568 181.0 0.05263 0.04362 0.00000 \n", + "\n", + " concave points_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 0.14710 ... 25.380 17.33 184.60 \n", + "1 0.07017 ... 24.990 23.41 158.80 \n", + "2 0.12790 ... 23.570 25.53 152.50 \n", + "3 0.10520 ... 14.910 26.50 98.87 \n", + "4 0.10430 ... 22.540 16.67 152.20 \n", + ".. ... ... ... ... ... \n", + "564 0.13890 ... 25.450 26.40 166.10 \n", + "565 0.09791 ... 23.690 38.25 155.00 \n", + "566 0.05302 ... 18.980 34.12 126.70 \n", + "567 0.15200 ... 25.740 39.42 184.60 \n", + "568 0.00000 ... 9.456 30.37 59.16 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2019.0 0.16220 0.66560 0.7119 \n", + "1 1956.0 0.12380 0.18660 0.2416 \n", + "2 1709.0 0.14440 0.42450 0.4504 \n", + "3 567.7 0.20980 0.86630 0.6869 \n", + "4 1575.0 0.13740 0.20500 0.4000 \n", + ".. ... ... ... ... 
\n", + "564 2027.0 0.14100 0.21130 0.4107 \n", + "565 1731.0 0.11660 0.19220 0.3215 \n", + "566 1124.0 0.11390 0.30940 0.3403 \n", + "567 1821.0 0.16500 0.86810 0.9387 \n", + "568 268.6 0.08996 0.06444 0.0000 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + ".. ... ... ... \n", + "564 0.2216 0.2060 0.07115 \n", + "565 0.1628 0.2572 0.06637 \n", + "566 0.1418 0.2218 0.07820 \n", + "567 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.2871 0.07039 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cancer['diagnosis'] = cancer['diagnosis'].replace({\n", + " \"M\": \"Malignant\",\n", + " \"B\" :\"Benign\"\n", + "})\n", + "\n", + "cancer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d2ae588", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "standardized_cancer = cancer.copy() #Creating a copy of the original dataframe\n", + "\n", + "#Exclude the ID and diagnosis columns from standardization\n", + "columns_to_exclude = ['id', 'diagnosis']\n", + "\n", + "#Selecting columns to standardize \n", + "columns_to_scale = standardized_cancer.columns.difference(columns_to_exclude) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "eb9a976a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
0842302Malignant1.097064-2.0733351.2699340.9843751.5684663.2835152.6528742.532475...1.886690-1.3592932.3036012.0012371.3076862.6166652.1095262.2960762.7506221.937015
1842517Malignant1.829821-0.3536321.6859551.908708-0.826962-0.487072-0.0238460.548144...1.805927-0.3692031.5351261.890489-0.375612-0.430444-0.1467491.087084-0.2438900.281190
284300903Malignant1.5798880.4561871.5665031.5588840.9422101.0529261.3634782.037231...1.511870-0.0239741.3474751.4562850.5274071.0829320.8549741.9550001.1522550.201391
384348301Malignant-0.7689090.253732-0.592687-0.7644643.2835533.4029091.9158971.451707...-0.2814640.133984-0.249939-0.5500213.3942753.8933971.9895882.1757866.0460414.935010
484358402Malignant1.750297-1.1518161.7765731.8262290.2803720.5393401.3710111.428493...1.298575-1.4667701.3385391.2207240.220556-0.3133950.6131790.729259-0.868353-0.397100
..................................................................
564926424Malignant2.1109950.7214732.0607862.3438561.0418420.2190601.9472852.320965...1.9011850.1177001.7525632.0153010.378365-0.2733180.6645121.629151-1.360158-0.709091
565926682Malignant1.7048542.0851341.6159311.7238420.102458-0.0178330.6930431.263669...1.5367202.0473991.4219401.494959-0.691230-0.3948200.2365730.733827-0.531855-0.973978
566926954Malignant0.7022842.0455740.6726760.577953-0.840484-0.0386800.0465880.105777...0.5613611.3748540.5790010.427906-0.8095870.3507350.3267670.414069-1.104549-0.318409
567927241Malignant1.8383412.3364571.9825241.7352181.5257673.2721443.2969442.658866...1.9612392.2379262.3036011.6531711.4304273.9048483.1976052.2899851.9190832.219635
56892751Benign-1.8084011.221792-1.814389-1.347789-3.112085-1.150752-1.114873-1.261820...-1.4108930.764190-1.432735-1.075813-1.859019-1.207552-1.305831-1.745063-0.048138-0.751207
\n", + "

569 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean \\\n", + "0 842302 Malignant 1.097064 -2.073335 1.269934 \n", + "1 842517 Malignant 1.829821 -0.353632 1.685955 \n", + "2 84300903 Malignant 1.579888 0.456187 1.566503 \n", + "3 84348301 Malignant -0.768909 0.253732 -0.592687 \n", + "4 84358402 Malignant 1.750297 -1.151816 1.776573 \n", + ".. ... ... ... ... ... \n", + "564 926424 Malignant 2.110995 0.721473 2.060786 \n", + "565 926682 Malignant 1.704854 2.085134 1.615931 \n", + "566 926954 Malignant 0.702284 2.045574 0.672676 \n", + "567 927241 Malignant 1.838341 2.336457 1.982524 \n", + "568 92751 Benign -1.808401 1.221792 -1.814389 \n", + "\n", + " area_mean smoothness_mean compactness_mean concavity_mean \\\n", + "0 0.984375 1.568466 3.283515 2.652874 \n", + "1 1.908708 -0.826962 -0.487072 -0.023846 \n", + "2 1.558884 0.942210 1.052926 1.363478 \n", + "3 -0.764464 3.283553 3.402909 1.915897 \n", + "4 1.826229 0.280372 0.539340 1.371011 \n", + ".. ... ... ... ... \n", + "564 2.343856 1.041842 0.219060 1.947285 \n", + "565 1.723842 0.102458 -0.017833 0.693043 \n", + "566 0.577953 -0.840484 -0.038680 0.046588 \n", + "567 1.735218 1.525767 3.272144 3.296944 \n", + "568 -1.347789 -3.112085 -1.150752 -1.114873 \n", + "\n", + " concave points_mean ... radius_worst texture_worst perimeter_worst \\\n", + "0 2.532475 ... 1.886690 -1.359293 2.303601 \n", + "1 0.548144 ... 1.805927 -0.369203 1.535126 \n", + "2 2.037231 ... 1.511870 -0.023974 1.347475 \n", + "3 1.451707 ... -0.281464 0.133984 -0.249939 \n", + "4 1.428493 ... 1.298575 -1.466770 1.338539 \n", + ".. ... ... ... ... ... \n", + "564 2.320965 ... 1.901185 0.117700 1.752563 \n", + "565 1.263669 ... 1.536720 2.047399 1.421940 \n", + "566 0.105777 ... 0.561361 1.374854 0.579001 \n", + "567 2.658866 ... 1.961239 2.237926 2.303601 \n", + "568 -1.261820 ... 
-1.410893 0.764190 -1.432735 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "0 2.001237 1.307686 2.616665 2.109526 \n", + "1 1.890489 -0.375612 -0.430444 -0.146749 \n", + "2 1.456285 0.527407 1.082932 0.854974 \n", + "3 -0.550021 3.394275 3.893397 1.989588 \n", + "4 1.220724 0.220556 -0.313395 0.613179 \n", + ".. ... ... ... ... \n", + "564 2.015301 0.378365 -0.273318 0.664512 \n", + "565 1.494959 -0.691230 -0.394820 0.236573 \n", + "566 0.427906 -0.809587 0.350735 0.326767 \n", + "567 1.653171 1.430427 3.904848 3.197605 \n", + "568 -1.075813 -1.859019 -1.207552 -1.305831 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "0 2.296076 2.750622 1.937015 \n", + "1 1.087084 -0.243890 0.281190 \n", + "2 1.955000 1.152255 0.201391 \n", + "3 2.175786 6.046041 4.935010 \n", + "4 0.729259 -0.868353 -0.397100 \n", + ".. ... ... ... \n", + "564 1.629151 -1.360158 -0.709091 \n", + "565 0.733827 -0.531855 -0.973978 \n", + "566 0.414069 -1.104549 -0.318409 \n", + "567 2.289985 1.919083 2.219635 \n", + "568 -1.745063 -0.048138 -0.751207 \n", + "\n", + "[569 rows x 32 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scaler = StandardScaler()\n", + "standardized_cancer[columns_to_scale] = scaler.fit_transform(standardized_cancer[columns_to_scale])\n", + "standardized_cancer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "790e5afe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...radius_worsttexture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worst
1648712289Malignant2.5966590.6400252.4768072.932585-0.8518700.1925280.5474051.240713...2.4313170.4140752.2916862.676276-0.4194480.6618080.5882321.8270971.1134280.439125
28852973Malignant0.3330661.3916680.4296540.2204490.8425791.2386500.9981290.995412...0.8284981.7966191.2521610.6828031.3909742.2693331.7334011.3368001.8220160.820940
3789013594Benign-0.132717-0.963324-0.152364-0.211286-0.973563-0.546958-0.581412-0.624450...-0.358085-0.983124-0.277044-0.393040-0.2134190.357097-0.073347-0.1401790.7866370.689050
1318670Malignant0.3785080.0442960.4008200.2673770.9137440.3403500.7256860.824140...0.6193450.0525620.5253860.4841590.974533-0.0945620.5129110.560244-0.103143-0.208132
23388206102Malignant1.8127801.9827431.7477401.888800-0.3394790.0579730.8361700.889399...1.6982451.9057251.6512911.742824-0.4413660.1389010.6832230.634854-0.750255-0.036897
..................................................................
360901034302Benign-0.450813-0.283820-0.516897-0.463558-1.565660-1.475202-1.099882-1.121268...-0.527893-0.764914-0.608859-0.518379-1.728826-1.342223-1.288651-1.496108-1.080282-1.592419
301892604Benign-0.4735350.139706-0.475295-0.522146-0.843330-0.055736-0.257368-0.462464...-0.581734-0.424570-0.569839-0.578851-1.199727-0.244691-0.392382-0.584035-0.349045-0.349442
406905189Benign0.571638-1.0308090.5079150.412710-0.100363-0.366351-0.424349-0.093868...0.298367-0.9928950.2573140.118337-0.515887-0.522048-0.197603-0.025980-0.198592-0.766169
27852781Malignant1.2731530.2234801.2411011.248876-0.1395040.0428120.7558180.732313...1.0438640.2577450.9721740.9183630.062747-0.2707730.3473960.523700-0.905562-0.539518
2848912284Benign-0.351408-0.835335-0.324951-0.393308-1.293808-0.1618640.285006-0.387404...-0.490618-0.974982-0.450994-0.500975-1.451345-0.1435450.298461-0.196518-1.458843-0.702441
\n", + "

426 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis radius_mean texture_mean perimeter_mean \\\n", + "164 8712289 Malignant 2.596659 0.640025 2.476807 \n", + "28 852973 Malignant 0.333066 1.391668 0.429654 \n", + "378 9013594 Benign -0.132717 -0.963324 -0.152364 \n", + "131 8670 Malignant 0.378508 0.044296 0.400820 \n", + "233 88206102 Malignant 1.812780 1.982743 1.747740 \n", + ".. ... ... ... ... ... \n", + "360 901034302 Benign -0.450813 -0.283820 -0.516897 \n", + "301 892604 Benign -0.473535 0.139706 -0.475295 \n", + "406 905189 Benign 0.571638 -1.030809 0.507915 \n", + "27 852781 Malignant 1.273153 0.223480 1.241101 \n", + "284 8912284 Benign -0.351408 -0.835335 -0.324951 \n", + "\n", + " area_mean smoothness_mean compactness_mean concavity_mean \\\n", + "164 2.932585 -0.851870 0.192528 0.547405 \n", + "28 0.220449 0.842579 1.238650 0.998129 \n", + "378 -0.211286 -0.973563 -0.546958 -0.581412 \n", + "131 0.267377 0.913744 0.340350 0.725686 \n", + "233 1.888800 -0.339479 0.057973 0.836170 \n", + ".. ... ... ... ... \n", + "360 -0.463558 -1.565660 -1.475202 -1.099882 \n", + "301 -0.522146 -0.843330 -0.055736 -0.257368 \n", + "406 0.412710 -0.100363 -0.366351 -0.424349 \n", + "27 1.248876 -0.139504 0.042812 0.755818 \n", + "284 -0.393308 -1.293808 -0.161864 0.285006 \n", + "\n", + " concave points_mean ... radius_worst texture_worst perimeter_worst \\\n", + "164 1.240713 ... 2.431317 0.414075 2.291686 \n", + "28 0.995412 ... 0.828498 1.796619 1.252161 \n", + "378 -0.624450 ... -0.358085 -0.983124 -0.277044 \n", + "131 0.824140 ... 0.619345 0.052562 0.525386 \n", + "233 0.889399 ... 1.698245 1.905725 1.651291 \n", + ".. ... ... ... ... ... \n", + "360 -1.121268 ... -0.527893 -0.764914 -0.608859 \n", + "301 -0.462464 ... -0.581734 -0.424570 -0.569839 \n", + "406 -0.093868 ... 0.298367 -0.992895 0.257314 \n", + "27 0.732313 ... 1.043864 0.257745 0.972174 \n", + "284 -0.387404 ... 
-0.490618 -0.974982 -0.450994 \n", + "\n", + " area_worst smoothness_worst compactness_worst concavity_worst \\\n", + "164 2.676276 -0.419448 0.661808 0.588232 \n", + "28 0.682803 1.390974 2.269333 1.733401 \n", + "378 -0.393040 -0.213419 0.357097 -0.073347 \n", + "131 0.484159 0.974533 -0.094562 0.512911 \n", + "233 1.742824 -0.441366 0.138901 0.683223 \n", + ".. ... ... ... ... \n", + "360 -0.518379 -1.728826 -1.342223 -1.288651 \n", + "301 -0.578851 -1.199727 -0.244691 -0.392382 \n", + "406 0.118337 -0.515887 -0.522048 -0.197603 \n", + "27 0.918363 0.062747 -0.270773 0.347396 \n", + "284 -0.500975 -1.451345 -0.143545 0.298461 \n", + "\n", + " concave points_worst symmetry_worst fractal_dimension_worst \n", + "164 1.827097 1.113428 0.439125 \n", + "28 1.336800 1.822016 0.820940 \n", + "378 -0.140179 0.786637 0.689050 \n", + "131 0.560244 -0.103143 -0.208132 \n", + "233 0.634854 -0.750255 -0.036897 \n", + ".. ... ... ... \n", + "360 -1.496108 -1.080282 -1.592419 \n", + "301 -0.584035 -0.349045 -0.349442 \n", + "406 -0.025980 -0.198592 -0.766169 \n", + "27 0.523700 -0.905562 -0.539518 \n", + "284 -0.196518 -1.458843 -0.702441 \n", + "\n", + "[426 rows x 32 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set the random seed\n", + "np.random.seed(1)\n", + "\n", + "# Split the data into a training and testing set. 
\n", + "# Stratify to ensure that the class distribution (benign vs malignant) is preserved in both the training and testing sets\n", + "cancer_train, cancer_test = train_test_split(\n", + " standardized_cancer, train_size=0.75, shuffle=True, stratify=standardized_cancer['diagnosis']\n", + ")\n", + "\n", + "cancer_train\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2e235ee6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 426 entries, 164 to 284\n", + "Data columns (total 32 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 426 non-null int64 \n", + " 1 diagnosis 426 non-null object \n", + " 2 radius_mean 426 non-null float64\n", + " 3 texture_mean 426 non-null float64\n", + " 4 perimeter_mean 426 non-null float64\n", + " 5 area_mean 426 non-null float64\n", + " 6 smoothness_mean 426 non-null float64\n", + " 7 compactness_mean 426 non-null float64\n", + " 8 concavity_mean 426 non-null float64\n", + " 9 concave points_mean 426 non-null float64\n", + " 10 symmetry_mean 426 non-null float64\n", + " 11 fractal_dimension_mean 426 non-null float64\n", + " 12 radius_se 426 non-null float64\n", + " 13 texture_se 426 non-null float64\n", + " 14 perimeter_se 426 non-null float64\n", + " 15 area_se 426 non-null float64\n", + " 16 smoothness_se 426 non-null float64\n", + " 17 compactness_se 426 non-null float64\n", + " 18 concavity_se 426 non-null float64\n", + " 19 concave points_se 426 non-null float64\n", + " 20 symmetry_se 426 non-null float64\n", + " 21 fractal_dimension_se 426 non-null float64\n", + " 22 radius_worst 426 non-null float64\n", + " 23 texture_worst 426 non-null float64\n", + " 24 perimeter_worst 426 non-null float64\n", + " 25 area_worst 426 non-null float64\n", + " 26 smoothness_worst 426 non-null float64\n", + " 27 compactness_worst 426 non-null float64\n", + " 28 concavity_worst 426 non-null float64\n", + " 29 concave points_worst 426 non-null float64\n", + " 
30 symmetry_worst 426 non-null float64\n", + " 31 fractal_dimension_worst 426 non-null float64\n", + "dtypes: float64(30), int64(1), object(1)\n", + "memory usage: 109.8+ KB\n" + ] + } + ], + "source": [ + "cancer_train.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59004d28", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 143 entries, 357 to 332\n", + "Data columns (total 32 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 143 non-null int64 \n", + " 1 diagnosis 143 non-null object \n", + " 2 radius_mean 143 non-null float64\n", + " 3 texture_mean 143 non-null float64\n", + " 4 perimeter_mean 143 non-null float64\n", + " 5 area_mean 143 non-null float64\n", + " 6 smoothness_mean 143 non-null float64\n", + " 7 compactness_mean 143 non-null float64\n", + " 8 concavity_mean 143 non-null float64\n", + " 9 concave points_mean 143 non-null float64\n", + " 10 symmetry_mean 143 non-null float64\n", + " 11 fractal_dimension_mean 143 non-null float64\n", + " 12 radius_se 143 non-null float64\n", + " 13 texture_se 143 non-null float64\n", + " 14 perimeter_se 143 non-null float64\n", + " 15 area_se 143 non-null float64\n", + " 16 smoothness_se 143 non-null float64\n", + " 17 compactness_se 143 non-null float64\n", + " 18 concavity_se 143 non-null float64\n", + " 19 concave points_se 143 non-null float64\n", + " 20 symmetry_se 143 non-null float64\n", + " 21 fractal_dimension_se 143 non-null float64\n", + " 22 radius_worst 143 non-null float64\n", + " 23 texture_worst 143 non-null float64\n", + " 24 perimeter_worst 143 non-null float64\n", + " 25 area_worst 143 non-null float64\n", + " 26 smoothness_worst 143 non-null float64\n", + " 27 compactness_worst 143 non-null float64\n", + " 28 concavity_worst 143 non-null float64\n", + " 29 concave points_worst 143 non-null float64\n", + " 30 symmetry_worst 143 non-null float64\n", + 
" 31 fractal_dimension_worst 143 non-null float64\n", + "dtypes: float64(30), int64(1), object(1)\n", + "memory usage: 36.9+ KB\n" + ] + } + ], + "source": [ + "cancer_test.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a626e1ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Use the KNN algorithm to classify the tumors in the test set.\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "knn\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "12bf28a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Define predictor variable (X) and response variable (y)\n", + "\n", + "X = cancer_train[[\"perimeter_mean\",\"concavity_mean\"]]\n", + "y = cancer_train['diagnosis'] " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e093ff17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier()" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn.fit(X, y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5d7188f", + "metadata": {}, + "outputs": [], + "source": [ + "# Step 4. Predict on the test data\n", + "# Make predictions on the test set\n", + "cancer_test['predicted'] = knn.predict(cancer_test[[\"perimeter_mean\",\"concavity_mean\"]])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "13766f35", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiagnosispredicted
357901028BenignBenign
361901041BenignBenign
2128810703MalignantMalignant
52791813702BenignBenign
218510824BenignBenign
............
3649010877BenignBenign
434908469BenignBenign
299892399BenignBenign
488913512BenignBenign
332897132BenignBenign
\n", + "

143 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " id diagnosis predicted\n", + "357 901028 Benign Benign\n", + "361 901041 Benign Benign\n", + "212 8810703 Malignant Malignant\n", + "527 91813702 Benign Benign\n", + "21 8510824 Benign Benign\n", + ".. ... ... ...\n", + "364 9010877 Benign Benign\n", + "434 908469 Benign Benign\n", + "299 892399 Benign Benign\n", + "488 913512 Benign Benign\n", + "332 897132 Benign Benign\n", + "\n", + "[143 rows x 3 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compare the predicted values to the actual values\n", + "cancer_test[['id','diagnosis','predicted']]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "85dc55a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9230769230769231" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Calculate the accuracy of the model\n", + "knn.score(\n", + " cancer_test[[\"perimeter_mean\",\"concavity_mean\"]], cancer_test['diagnosis']\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "59cf4bb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PredictedBenignMalignant
Actual
Benign882
Malignant944
\n", + "
" + ], + "text/plain": [ + "Predicted Benign Malignant\n", + "Actual \n", + "Benign 88 2\n", + "Malignant 9 44" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Confusion matrix (actual vs. predicted) to evaluate the model\n", + "pd.crosstab(\n", + " cancer_test['diagnosis'],\n", + " cancer_test['predicted'],\n", + " rownames=['Actual'],\n", + " colnames=['Predicted']\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "936aaea1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9565217391304348" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Calculate precision\n", + "precision_score(\n", + " y_true = cancer_test['diagnosis'],\n", + " y_pred = cancer_test['predicted'],\n", + " pos_label= \"Malignant\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4ea43ebe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8301886792452831" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Calculate recall\n", + "recall_score(\n", + " y_true = cancer_test['diagnosis'],\n", + " y_pred = cancer_test['predicted'],\n", + " pos_label = \"Malignant\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "11524e1f", + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data into train and validation splits\n", + "\n", + "np.random.seed(1)\n", + "cancer_subtrain, cancer_validation = train_test_split(\n", + " cancer_train, train_size = 0.75,shuffle=True, stratify=cancer_train['diagnosis']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "007e01cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=4)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=4)" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit the model on the training data\n", + "# Step 1. Initialize the model\n", + "knn = KNeighborsClassifier(n_neighbors=4)\n", + "\n", + "#Step 2. Define the model X and y\n", + "X = cancer_subtrain[[\"perimeter_mean\",\"concavity_mean\"]]\n", + "y = cancer_subtrain['diagnosis']\n", + "\n", + "#Step 3. Fit the model to our data\n", + "knn.fit(X,y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "d9a73931", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.875" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# Step 4. Evaluate the model on the validation set\n", + "acc = knn.score(\n", + " cancer_validation[[\"perimeter_mean\",\"concavity_mean\"]],\n", + " cancer_validation['diagnosis']\n", + ")\n", + "acc" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "aecad8a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fit_timescore_timetest_score
00.0023250.0076120.906250
10.0022930.0077310.937500
20.0019850.0115540.859375
30.0027310.0091200.843750
40.0032330.0095780.936508
\n", + "
" + ], + "text/plain": [ + " fit_time score_time test_score\n", + "0 0.002325 0.007612 0.906250\n", + "1 0.002293 0.007731 0.937500\n", + "2 0.001985 0.011554 0.859375\n", + "3 0.002731 0.009120 0.843750\n", + "4 0.003233 0.009578 0.936508" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Step 5. Cross-validate the model on the training set using 5 folds\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=3)\n", + "X = cancer_train[[\"perimeter_mean\",\"concavity_mean\"]] #Predictor variables\n", + "y = cancer_train['diagnosis'] #Response variable \n", + "\n", + "# Cross-validate the model\n", + "returned_dictionary = cross_validate(\n", + " estimator= knn, #The model to evaluate\n", + " cv = 5, #Number of folds in the cross-validation \n", + " X = X, #Predictor variables \n", + " y = y #Response variable\n", + ")\n", + "\n", + "cv_5_df = pd.DataFrame(returned_dictionary)\n", + "cv_5_df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "725902ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fit_timescore_timetest_score
mean0.0025130.0091190.896677
sem0.0002150.0007190.019413
\n", + "
" + ], + "text/plain": [ + " fit_time score_time test_score\n", + "mean 0.002513 0.009119 0.896677\n", + "sem 0.000215 0.000719 0.019413" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Display the agg mean and sem\n", + "cv_5_metrics = cv_5_df.agg(['mean','sem'])\n", + "cv_5_metrics\n" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "0cc58e75", + "metadata": {}, + "outputs": [], + "source": [ + "# Step 6. Hyperparameter tuning with GridSearchCV, try k = 1 to 251 in steps of 5\n", + "parameter_grid = {\n", + " \"n_neighbors\" : range(1,255,5) \n", + "}\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "c09c28e9", + "metadata": {}, + "outputs": [], + "source": [ + "# Use GridSearchCV to search for the best hyperparameter with 5-fold cross-validation\n", + "cancer_tune_grid = GridSearchCV(\n", + " estimator = knn, \n", + " param_grid = parameter_grid, \n", + " cv = 5 \n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "8c171e02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_neighbors=31),\n",
+       "             param_grid={'n_neighbors': range(1, 255, 5)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_neighbors=31),\n", + " param_grid={'n_neighbors': range(1, 255, 5)})" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit the model to the training data\n", + "cancer_tune_grid.fit(\n", + " cancer_train[[\"perimeter_mean\",\"concavity_mean\"]],\n", + " cancer_train['diagnosis']\n", + ") \n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "bb1aef37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + "
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_n_neighborsparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoremean_test_scorestd_test_scorerank_test_score
00.0026090.0002460.0099800.0015161{'n_neighbors': 1}0.8437500.9062500.8437500.8437500.9206350.8716270.03444433
10.0115440.0164870.0234470.0267406{'n_neighbors': 6}0.8906250.9375000.9218750.8906250.9841270.9249500.03471410
20.0057110.0046100.0144180.01075611{'n_neighbors': 11}0.8906250.9531250.9062500.9062500.9682540.9249010.03015511
30.0023040.0002510.0074190.00080016{'n_neighbors': 16}0.8750000.9531250.9062500.8750000.9682540.9155260.03889614
40.0021860.0001420.0070260.00047621{'n_neighbors': 21}0.9062500.9531250.9062500.9062500.9841270.9312000.0320923
50.0021800.0002110.0070340.00033026{'n_neighbors': 26}0.8906250.9375000.9218750.9062500.9841270.9280750.0320875
60.0022340.0001030.0071280.00040231{'n_neighbors': 31}0.9062500.9531250.9218750.9062500.9841270.9343250.0302161
70.0022000.0002260.0068860.00055336{'n_neighbors': 36}0.9062500.9531250.9218750.9218750.9682540.9342760.0228162
80.0024000.0001930.0068150.00058641{'n_neighbors': 41}0.8906250.9531250.9218750.9218750.9523810.9279760.0232286
90.0021170.0002800.0070150.00047146{'n_neighbors': 46}0.9062500.9531250.9218750.9218750.9523810.9311010.0185784
100.0021490.0002430.0072120.00060651{'n_neighbors': 51}0.8906250.9531250.9218750.9218750.9365080.9248020.02061312
110.0020710.0001290.0071280.00050156{'n_neighbors': 56}0.8906250.9531250.9218750.9218750.9523810.9279760.0232286
120.0024410.0002470.0066980.00032661{'n_neighbors': 61}0.8906250.9531250.9218750.9218750.9523810.9279760.0232286
130.0021480.0001880.0071670.00044566{'n_neighbors': 66}0.8906250.9531250.9218750.9218750.9523810.9279760.0232286
140.0020490.0001200.0074030.00066271{'n_neighbors': 71}0.8750000.9531250.8906250.9218750.9523810.9186010.03170913
150.0023540.0002460.0071940.00041476{'n_neighbors': 76}0.8750000.9531250.8750000.9218750.9523810.9154760.03492015
160.0021920.0003170.0073250.00060581{'n_neighbors': 81}0.8750000.9531250.8750000.9218750.9523810.9154760.03492015
170.0022200.0002170.0072120.00049586{'n_neighbors': 86}0.8750000.9531250.8750000.9218750.9523810.9154760.03492015
180.0021650.0001540.0077080.00091191{'n_neighbors': 91}0.8750000.9531250.8750000.9218750.9523810.9154760.03492015
190.0021150.0001730.0075180.00069396{'n_neighbors': 96}0.8750000.9531250.8750000.9218750.9523810.9154760.03492015
200.0022820.0002600.0076480.000725101{'n_neighbors': 101}0.8593750.9531250.8750000.9218750.9523810.9123510.03887720
210.0025740.0004400.0075480.000593106{'n_neighbors': 106}0.8593750.9218750.8750000.9218750.9523810.9061010.03403021
220.0024150.0001980.0073260.000485111{'n_neighbors': 111}0.8593750.9218750.8750000.9062500.9523810.9029760.03314322
230.0021090.0002400.0074050.000745116{'n_neighbors': 116}0.8593750.9062500.8750000.8906250.9523810.8967260.03191423
240.0022040.0002040.0074080.000670121{'n_neighbors': 121}0.8593750.9062500.8750000.8750000.9523810.8936010.03310125
250.0021390.0002320.0073790.000371126{'n_neighbors': 126}0.8437500.9062500.8750000.8750000.9682540.8936510.04221424
260.0021620.0003270.0083200.000892131{'n_neighbors': 131}0.8281250.8906250.8750000.8906250.9682540.8905260.04511526
270.0020560.0001930.0089320.000954136{'n_neighbors': 136}0.8437500.8906250.8750000.8593750.9682540.8874010.04334127
280.0021510.0004290.0080310.000780141{'n_neighbors': 141}0.8437500.8906250.8750000.8593750.9682540.8874010.04334127
290.0020660.0001400.0087440.000615146{'n_neighbors': 146}0.8437500.8906250.8750000.8593750.9682540.8874010.04334127
300.0022830.0004850.0083310.000218151{'n_neighbors': 151}0.8437500.8906250.8750000.8593750.9682540.8874010.04334127
310.0021210.0003210.0081940.000715156{'n_neighbors': 156}0.8437500.8906250.8437500.8281250.9682540.8749010.05116831
320.0020660.0003410.0084140.000868161{'n_neighbors': 161}0.8437500.8750000.8437500.8281250.9682540.8717760.05058632
330.0024740.0005170.0089150.000519166{'n_neighbors': 166}0.8125000.7968750.8125000.8125000.9206350.8310020.04522334
340.0021390.0002870.0083560.000553171{'n_neighbors': 171}0.8125000.7812500.8125000.7656250.9206350.8185020.05419835
350.0022280.0001890.0087630.000891176{'n_neighbors': 176}0.7656250.7187500.7187500.7656250.9047620.7747020.06832536
360.0021520.0002880.0084770.000637181{'n_neighbors': 181}0.7500000.7031250.7187500.7500000.8888890.7621530.06591737
370.0021220.0002530.0091120.000844186{'n_neighbors': 186}0.7031250.6718750.6875000.7343750.7460320.7085810.02789038
380.0022020.0003790.0091790.001082191{'n_neighbors': 191}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
390.0021450.0002380.0089000.000476196{'n_neighbors': 196}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
400.0022110.0003530.0094860.000353201{'n_neighbors': 201}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
410.0020420.0001220.0086380.000671206{'n_neighbors': 206}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
420.0025730.0004440.0090050.000865211{'n_neighbors': 211}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
430.0023300.0005110.0084350.000403216{'n_neighbors': 216}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
440.0022610.0002370.0093810.000628221{'n_neighbors': 221}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
450.0021270.0002630.0085740.000681226{'n_neighbors': 226}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
460.0023870.0004370.0091610.000595231{'n_neighbors': 231}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
470.0020750.0002870.0091610.000459236{'n_neighbors': 236}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
480.0021470.0003930.0095070.000920241{'n_neighbors': 241}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
490.0022780.0004600.0096160.000783246{'n_neighbors': 246}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
500.0021020.0002340.0100300.000446251{'n_neighbors': 251}0.6250000.6250000.6250000.6250000.6349210.6269840.00396839
\n", + "
" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.002609 0.000246 0.009980 0.001516 \n", + "1 0.011544 0.016487 0.023447 0.026740 \n", + "2 0.005711 0.004610 0.014418 0.010756 \n", + "3 0.002304 0.000251 0.007419 0.000800 \n", + "4 0.002186 0.000142 0.007026 0.000476 \n", + "5 0.002180 0.000211 0.007034 0.000330 \n", + "6 0.002234 0.000103 0.007128 0.000402 \n", + "7 0.002200 0.000226 0.006886 0.000553 \n", + "8 0.002400 0.000193 0.006815 0.000586 \n", + "9 0.002117 0.000280 0.007015 0.000471 \n", + "10 0.002149 0.000243 0.007212 0.000606 \n", + "11 0.002071 0.000129 0.007128 0.000501 \n", + "12 0.002441 0.000247 0.006698 0.000326 \n", + "13 0.002148 0.000188 0.007167 0.000445 \n", + "14 0.002049 0.000120 0.007403 0.000662 \n", + "15 0.002354 0.000246 0.007194 0.000414 \n", + "16 0.002192 0.000317 0.007325 0.000605 \n", + "17 0.002220 0.000217 0.007212 0.000495 \n", + "18 0.002165 0.000154 0.007708 0.000911 \n", + "19 0.002115 0.000173 0.007518 0.000693 \n", + "20 0.002282 0.000260 0.007648 0.000725 \n", + "21 0.002574 0.000440 0.007548 0.000593 \n", + "22 0.002415 0.000198 0.007326 0.000485 \n", + "23 0.002109 0.000240 0.007405 0.000745 \n", + "24 0.002204 0.000204 0.007408 0.000670 \n", + "25 0.002139 0.000232 0.007379 0.000371 \n", + "26 0.002162 0.000327 0.008320 0.000892 \n", + "27 0.002056 0.000193 0.008932 0.000954 \n", + "28 0.002151 0.000429 0.008031 0.000780 \n", + "29 0.002066 0.000140 0.008744 0.000615 \n", + "30 0.002283 0.000485 0.008331 0.000218 \n", + "31 0.002121 0.000321 0.008194 0.000715 \n", + "32 0.002066 0.000341 0.008414 0.000868 \n", + "33 0.002474 0.000517 0.008915 0.000519 \n", + "34 0.002139 0.000287 0.008356 0.000553 \n", + "35 0.002228 0.000189 0.008763 0.000891 \n", + "36 0.002152 0.000288 0.008477 0.000637 \n", + "37 0.002122 0.000253 0.009112 0.000844 \n", + "38 0.002202 0.000379 0.009179 0.001082 \n", + "39 0.002145 0.000238 0.008900 0.000476 \n", + "40 0.002211 0.000353 0.009486 
0.000353 \n", + "41 0.002042 0.000122 0.008638 0.000671 \n", + "42 0.002573 0.000444 0.009005 0.000865 \n", + "43 0.002330 0.000511 0.008435 0.000403 \n", + "44 0.002261 0.000237 0.009381 0.000628 \n", + "45 0.002127 0.000263 0.008574 0.000681 \n", + "46 0.002387 0.000437 0.009161 0.000595 \n", + "47 0.002075 0.000287 0.009161 0.000459 \n", + "48 0.002147 0.000393 0.009507 0.000920 \n", + "49 0.002278 0.000460 0.009616 0.000783 \n", + "50 0.002102 0.000234 0.010030 0.000446 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 1 {'n_neighbors': 1} 0.843750 \n", + "1 6 {'n_neighbors': 6} 0.890625 \n", + "2 11 {'n_neighbors': 11} 0.890625 \n", + "3 16 {'n_neighbors': 16} 0.875000 \n", + "4 21 {'n_neighbors': 21} 0.906250 \n", + "5 26 {'n_neighbors': 26} 0.890625 \n", + "6 31 {'n_neighbors': 31} 0.906250 \n", + "7 36 {'n_neighbors': 36} 0.906250 \n", + "8 41 {'n_neighbors': 41} 0.890625 \n", + "9 46 {'n_neighbors': 46} 0.906250 \n", + "10 51 {'n_neighbors': 51} 0.890625 \n", + "11 56 {'n_neighbors': 56} 0.890625 \n", + "12 61 {'n_neighbors': 61} 0.890625 \n", + "13 66 {'n_neighbors': 66} 0.890625 \n", + "14 71 {'n_neighbors': 71} 0.875000 \n", + "15 76 {'n_neighbors': 76} 0.875000 \n", + "16 81 {'n_neighbors': 81} 0.875000 \n", + "17 86 {'n_neighbors': 86} 0.875000 \n", + "18 91 {'n_neighbors': 91} 0.875000 \n", + "19 96 {'n_neighbors': 96} 0.875000 \n", + "20 101 {'n_neighbors': 101} 0.859375 \n", + "21 106 {'n_neighbors': 106} 0.859375 \n", + "22 111 {'n_neighbors': 111} 0.859375 \n", + "23 116 {'n_neighbors': 116} 0.859375 \n", + "24 121 {'n_neighbors': 121} 0.859375 \n", + "25 126 {'n_neighbors': 126} 0.843750 \n", + "26 131 {'n_neighbors': 131} 0.828125 \n", + "27 136 {'n_neighbors': 136} 0.843750 \n", + "28 141 {'n_neighbors': 141} 0.843750 \n", + "29 146 {'n_neighbors': 146} 0.843750 \n", + "30 151 {'n_neighbors': 151} 0.843750 \n", + "31 156 {'n_neighbors': 156} 0.843750 \n", + "32 161 {'n_neighbors': 161} 0.843750 \n", + "33 166 
{'n_neighbors': 166} 0.812500 \n", + "34 171 {'n_neighbors': 171} 0.812500 \n", + "35 176 {'n_neighbors': 176} 0.765625 \n", + "36 181 {'n_neighbors': 181} 0.750000 \n", + "37 186 {'n_neighbors': 186} 0.703125 \n", + "38 191 {'n_neighbors': 191} 0.625000 \n", + "39 196 {'n_neighbors': 196} 0.625000 \n", + "40 201 {'n_neighbors': 201} 0.625000 \n", + "41 206 {'n_neighbors': 206} 0.625000 \n", + "42 211 {'n_neighbors': 211} 0.625000 \n", + "43 216 {'n_neighbors': 216} 0.625000 \n", + "44 221 {'n_neighbors': 221} 0.625000 \n", + "45 226 {'n_neighbors': 226} 0.625000 \n", + "46 231 {'n_neighbors': 231} 0.625000 \n", + "47 236 {'n_neighbors': 236} 0.625000 \n", + "48 241 {'n_neighbors': 241} 0.625000 \n", + "49 246 {'n_neighbors': 246} 0.625000 \n", + "50 251 {'n_neighbors': 251} 0.625000 \n", + "\n", + " split1_test_score split2_test_score split3_test_score \\\n", + "0 0.906250 0.843750 0.843750 \n", + "1 0.937500 0.921875 0.890625 \n", + "2 0.953125 0.906250 0.906250 \n", + "3 0.953125 0.906250 0.875000 \n", + "4 0.953125 0.906250 0.906250 \n", + "5 0.937500 0.921875 0.906250 \n", + "6 0.953125 0.921875 0.906250 \n", + "7 0.953125 0.921875 0.921875 \n", + "8 0.953125 0.921875 0.921875 \n", + "9 0.953125 0.921875 0.921875 \n", + "10 0.953125 0.921875 0.921875 \n", + "11 0.953125 0.921875 0.921875 \n", + "12 0.953125 0.921875 0.921875 \n", + "13 0.953125 0.921875 0.921875 \n", + "14 0.953125 0.890625 0.921875 \n", + "15 0.953125 0.875000 0.921875 \n", + "16 0.953125 0.875000 0.921875 \n", + "17 0.953125 0.875000 0.921875 \n", + "18 0.953125 0.875000 0.921875 \n", + "19 0.953125 0.875000 0.921875 \n", + "20 0.953125 0.875000 0.921875 \n", + "21 0.921875 0.875000 0.921875 \n", + "22 0.921875 0.875000 0.906250 \n", + "23 0.906250 0.875000 0.890625 \n", + "24 0.906250 0.875000 0.875000 \n", + "25 0.906250 0.875000 0.875000 \n", + "26 0.890625 0.875000 0.890625 \n", + "27 0.890625 0.875000 0.859375 \n", + "28 0.890625 0.875000 0.859375 \n", + "29 0.890625 0.875000 0.859375 
\n", + "30 0.890625 0.875000 0.859375 \n", + "31 0.890625 0.843750 0.828125 \n", + "32 0.875000 0.843750 0.828125 \n", + "33 0.796875 0.812500 0.812500 \n", + "34 0.781250 0.812500 0.765625 \n", + "35 0.718750 0.718750 0.765625 \n", + "36 0.703125 0.718750 0.750000 \n", + "37 0.671875 0.687500 0.734375 \n", + "38 0.625000 0.625000 0.625000 \n", + "39 0.625000 0.625000 0.625000 \n", + "40 0.625000 0.625000 0.625000 \n", + "41 0.625000 0.625000 0.625000 \n", + "42 0.625000 0.625000 0.625000 \n", + "43 0.625000 0.625000 0.625000 \n", + "44 0.625000 0.625000 0.625000 \n", + "45 0.625000 0.625000 0.625000 \n", + "46 0.625000 0.625000 0.625000 \n", + "47 0.625000 0.625000 0.625000 \n", + "48 0.625000 0.625000 0.625000 \n", + "49 0.625000 0.625000 0.625000 \n", + "50 0.625000 0.625000 0.625000 \n", + "\n", + " split4_test_score mean_test_score std_test_score rank_test_score \n", + "0 0.920635 0.871627 0.034444 33 \n", + "1 0.984127 0.924950 0.034714 10 \n", + "2 0.968254 0.924901 0.030155 11 \n", + "3 0.968254 0.915526 0.038896 14 \n", + "4 0.984127 0.931200 0.032092 3 \n", + "5 0.984127 0.928075 0.032087 5 \n", + "6 0.984127 0.934325 0.030216 1 \n", + "7 0.968254 0.934276 0.022816 2 \n", + "8 0.952381 0.927976 0.023228 6 \n", + "9 0.952381 0.931101 0.018578 4 \n", + "10 0.936508 0.924802 0.020613 12 \n", + "11 0.952381 0.927976 0.023228 6 \n", + "12 0.952381 0.927976 0.023228 6 \n", + "13 0.952381 0.927976 0.023228 6 \n", + "14 0.952381 0.918601 0.031709 13 \n", + "15 0.952381 0.915476 0.034920 15 \n", + "16 0.952381 0.915476 0.034920 15 \n", + "17 0.952381 0.915476 0.034920 15 \n", + "18 0.952381 0.915476 0.034920 15 \n", + "19 0.952381 0.915476 0.034920 15 \n", + "20 0.952381 0.912351 0.038877 20 \n", + "21 0.952381 0.906101 0.034030 21 \n", + "22 0.952381 0.902976 0.033143 22 \n", + "23 0.952381 0.896726 0.031914 23 \n", + "24 0.952381 0.893601 0.033101 25 \n", + "25 0.968254 0.893651 0.042214 24 \n", + "26 0.968254 0.890526 0.045115 26 \n", + "27 0.968254 0.887401 
0.043341 27 \n", + "28 0.968254 0.887401 0.043341 27 \n", + "29 0.968254 0.887401 0.043341 27 \n", + "30 0.968254 0.887401 0.043341 27 \n", + "31 0.968254 0.874901 0.051168 31 \n", + "32 0.968254 0.871776 0.050586 32 \n", + "33 0.920635 0.831002 0.045223 34 \n", + "34 0.920635 0.818502 0.054198 35 \n", + "35 0.904762 0.774702 0.068325 36 \n", + "36 0.888889 0.762153 0.065917 37 \n", + "37 0.746032 0.708581 0.027890 38 \n", + "38 0.634921 0.626984 0.003968 39 \n", + "39 0.634921 0.626984 0.003968 39 \n", + "40 0.634921 0.626984 0.003968 39 \n", + "41 0.634921 0.626984 0.003968 39 \n", + "42 0.634921 0.626984 0.003968 39 \n", + "43 0.634921 0.626984 0.003968 39 \n", + "44 0.634921 0.626984 0.003968 39 \n", + "45 0.634921 0.626984 0.003968 39 \n", + "46 0.634921 0.626984 0.003968 39 \n", + "47 0.634921 0.626984 0.003968 39 \n", + "48 0.634921 0.626984 0.003968 39 \n", + "49 0.634921 0.626984 0.003968 39 \n", + "50 0.634921 0.626984 0.003968 39 " + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_grid = pd.DataFrame(cancer_tune_grid.cv_results_)\n", + "accuracy_grid" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "dcf6b287", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create the plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot mean test scores with error bars\n", + "plt.plot(accuracy_grid['param_n_neighbors'], accuracy_grid['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add labels and legend\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "a3f91076", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 31}" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the best hyperparameter\n", + "cancer_tune_grid.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "e4446fcc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=31)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=31)" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier(n_neighbors= 31)\n", + "X = cancer_train[[\"perimeter_mean\",\"concavity_mean\"]]\n", + "y = cancer_train['diagnosis']\n", + "\n", + "knn.fit(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "3a604b18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9370629370629371" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn.score(\n", + " cancer_test[[\"perimeter_mean\",\"concavity_mean\"]],\n", + " cancer_test['diagnosis']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc338bb6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dsi_participant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}