From 6899220e41bea2c2712be4d06fc46c242e55b24b Mon Sep 17 00:00:00 2001 From: Fay Kisteroff Date: Sat, 6 Sep 2025 18:01:21 -0400 Subject: [PATCH 1/2] assignment 1 LCR --- 02_activities/assignments/assignment_1.ipynb | 1376 +++++++++++++++++- 1 file changed, 1340 insertions(+), 36 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index e50cc66eb..19beca820 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 110, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "178\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "rows, cols = wine_df.shape\n", + "\n", + "print(rows)" ] }, { @@ -109,12 +399,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "print(cols)" ] }, { @@ -127,12 +427,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "wine_df['class'].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "c5d5d5cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df['class'].unique()" ] }, { @@ -146,12 +480,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 114, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "13" + ] + }, 
+ "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "cols - 1" ] }, { @@ -175,10 +522,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 115, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +578,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "Since KNN is dependent on distance calculation, disproportionate scales in the predictor variables can skew the prediction and results in a way that paints an inaccurate narrative of what the dataset actually looks like. If one predictor variable is based on a larger scale numerically, then the model will be much more sensitive to changes in this variable. 
Standardization gets rid of this imbalance and puts all predictor variables on the same scale." ] }, { @@ -217,10 +591,10 @@ }, { "cell_type": "markdown", - "id": "fdee5a15", + "id": "105b01c2", "metadata": {}, "source": [ - "> Your answer here..." + "We don't want to standardize the response variable because this is what we are hoping to train the model to predict, and the response variable is not included in the distance calculation for the KNN model. " ] }, { @@ -236,7 +610,9 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "Setting a seed is important because when we split the dataset into the training and test sets, we do so randomly. If we don't set a seed, then the values will keep changing every time we try to run the model, which makes our predictions inconsistent. It also makes it much more difficult to reproduce positive results, or for others to use our model. When we set a seed, the randomness of the data splitting (into test/training) is predictable and executes the same split every time.\n", + "\n", + "The value of the seed used is arbitrary; it serves as a placeholder for the random number generator, so that it is able to randomize the test/train dataset the same way each time." ] }, { @@ -251,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -261,7 +637,14 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." 
+ "X_train, X_test, y_train, y_test = train_test_split(\n", + " predictors_standardized,\n", + " wine_df['class'],\n", + " train_size = 0.75,\n", + " shuffle = True,\n", + " stratify = wine_df['class'],\n", + " random_state=123\n", + ")" ] }, { @@ -284,12 +667,474 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "08818c64", + "execution_count": 117, + "id": "7a831a4e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n",
+       "             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n",
+       "                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,\n",
+       "                                         23, 24, 25, 26, 27, 28, 29, 30, ...]})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n", + " param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n", + " 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,\n", + " 23, 24, 25, 26, 27, 28, 29, 30, ...]})" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier()\n", + "param_grid = {'n_neighbors': list(range(1,51))}\n", + "\n", + "grid_search = GridSearchCV(knn, param_grid=param_grid, cv=10)\n", + "grid_search.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "0527c7d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7\n" + ] + } + ], "source": [ - "# Your code here..." + "best_k = grid_search.best_params_['n_neighbors']\n", + "\n", + "print(best_k)" ] }, { @@ -305,12 +1150,476 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 119, "id": "ffefa9f2", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=7)" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "\n", + "knn.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "1f561795", + "metadata": {}, "outputs": [], "source": [ - "# Your code here..." + "pred_set = knn.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "e422575b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9333333333333333" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy = accuracy_score(y_test, pred_set)\n", + "\n", + "accuracy" ] }, { @@ -365,7 +1674,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -379,12 +1688,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.9.18" } }, "nbformat": 4, From ddc02cef5aaedf949cb16d44e80958a8f3cc01be Mon Sep 17 00:00:00 2001 From: Fay Kisteroff Date: Sat, 6 Sep 2025 18:17:14 -0400 Subject: [PATCH 2/2] assignment 1 LCR --- 02_activities/assignments/assignment_1.ipynb | 223 ++++++++++--------- 1 file changed, 112 insertions(+), 111 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 19beca820..b175115e4 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 2, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,7 +56,7 @@ }, { 
"cell_type": "code", - "execution_count": 109, + "execution_count": 3, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, "outputs": [ @@ -333,7 +333,7 @@ "[178 rows x 14 columns]" ] }, - "execution_count": 109, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -369,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 4, "id": "56916892", "metadata": {}, "outputs": [ @@ -399,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 5, "id": "df0ef103", "metadata": {}, "outputs": [ @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 6, "id": "47989426", "metadata": {}, "outputs": [ @@ -437,7 +437,7 @@ "dtype('int64')" ] }, - "execution_count": 112, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -450,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 7, "id": "c5d5d5cc", "metadata": {}, "outputs": [ @@ -460,7 +460,7 @@ "array([0, 1, 2])" ] }, - "execution_count": 113, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -480,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 8, "id": "bd7b0910", "metadata": {}, "outputs": [ @@ -490,7 +490,7 @@ "13" ] }, - "execution_count": 114, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 9, "id": "cc899b59", "metadata": {}, "outputs": [ @@ -627,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 10, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -637,6 +637,7 @@ "\n", "# split the data into a training and testing set. 
hint: use train_test_split !\n", "\n", + "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " predictors_standardized,\n", " wine_df['class'],\n", @@ -667,14 +668,14 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 11, "id": "7a831a4e", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n",
+       "
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n",
        "             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n",
        "                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,\n",
-       "                                         23, 24, 25, 26, 27, 28, 29, 30, ...]})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=7)
KNeighborsClassifier(n_neighbors=7)
" ], "text/plain": [ "GridSearchCV(cv=10, estimator=KNeighborsClassifier(),\n", @@ -1104,7 +1105,7 @@ " 23, 24, 25, 26, 27, 28, 29, 30, ...]})" ] }, - "execution_count": 117, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1119,7 +1120,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 12, "id": "0527c7d8", "metadata": {}, "outputs": [ @@ -1150,14 +1151,14 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 13, "id": "ffefa9f2", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
KNeighborsClassifier(n_neighbors=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
KNeighborsClassifier(n_neighbors=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "KNeighborsClassifier(n_neighbors=7)" ] }, - "execution_count": 119, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1591,7 +1592,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 14, "id": "1f561795", "metadata": {}, "outputs": [], @@ -1601,7 +1602,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 15, "id": "e422575b", "metadata": {}, "outputs": [ @@ -1611,7 +1612,7 @@ "0.9333333333333333" ] }, - "execution_count": 121, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" }