From c2aa803dc8be0d73f44db42da84421c357dab32c Mon Sep 17 00:00:00 2001 From: Ziyi Dai Date: Sun, 23 Nov 2025 21:21:44 -0500 Subject: [PATCH 1/2] assignment_1 --- 02_activities/assignments/assignment_1.ipynb | 461 +++++++++++++++++-- 1 file changed, 430 insertions(+), 31 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..ab8ea602e 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 class 178 non-null int64 \n", + "dtypes: float64(13), int64(1)\n", + "memory usage: 19.6 KB\n" + ] + } + ], "source": [ - "# Your answer here" + "wine_df.info()\n", + "#The dataset contains 178 rows." ] }, { @@ -114,7 +421,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# The dataset contains 14 columns. 
" ] }, { @@ -127,12 +434,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# The class variable is integer. \n", + "wine_df['class'].unique()\n", + "#There are three unique levels, 0, 1 and 2. " ] }, { @@ -151,7 +471,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# 13 variables" ] }, { @@ -175,10 +495,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +551,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer 
here..." + "> It is important because standardization ensures a fair contribution from all variables (every variable will have a mean of 0 and a standard deviation of 1), so that the scale of the variables does not affect the model or the prediction process." ] }, { @@ -220,7 +567,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> Because Class is the response variable, not a predictor variable. Class is what we are trying to predict, and it is categorical, so it does not need standardization. " ] }, { @@ -236,7 +583,8 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> Setting a seed is important because it allows reproducibility. If we don't set a seed, then each run will be different; for example, splitting the data will give us different training and testing sets every time. The particular seed value is not important because, as long as we keep the same seed value, the results will be reproducible.\n", + "np.random.seed(123)" ] }, { @@ -260,8 +608,7 @@ "np.random.seed(123)\n", "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", - "\n", - "# Your code here ..." + "wine_x_train,wine_x_test,wine_y_train,wine_y_test = train_test_split (predictors, wine_df['class'], train_size=0.75,shuffle= True, stratify = wine_df ['class'] )" ] }, { @@ -284,12 +631,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 20}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." 
+ "np.random.seed(123)\n", + "\n", + "knn= KNeighborsClassifier(n_neighbors= 5) \n", + "\n", + "parameter_grid= {\n", + " \"n_neighbors\" : range(1,51)\n", + "}\n", + "\n", + "grid_search = GridSearchCV (\n", + " estimator=knn, \n", + " param_grid= parameter_grid,\n", + " cv= 10\n", + ")\n", + "\n", + "grid_search.fit (\n", + " wine_x_train,\n", + " wine_y_train\n", + ")\n", + "\n", + "accuracy_grid = pd.DataFrame (grid_search.cv_results_)\n", + "accuracy_grid\n", + "\n", + "grid_search.best_params_\n", + "#{'n_neighbors': 20}" ] }, { @@ -305,12 +686,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7333333333333333" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." + "np.random.seed(123)\n", + "\n", + "knn_best= KNeighborsClassifier(n_neighbors= 20)\n", + "\n", + "knn_best.fit(\n", + " wine_x_train, wine_y_train\n", + ")\n", + "\n", + "wine_y_predict = knn_best.predict(wine_x_test)\n", + "\n", + "accuracy_score (wine_y_test, wine_y_predict)\n", + "\n", + "#0.7333333333333333\n" ] }, { @@ -365,7 +769,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env", "language": "python", "name": "python3" }, @@ -379,12 +783,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.13" } }, "nbformat": 4, From 16621ddc4762adeda58a9d1a0dd3829b5b55d78b Mon Sep 17 00:00:00 2001 From: Ziyi Dai Date: Thu, 27 Nov 2025 13:07:33 -0500 Subject: [PATCH 2/2] updated_assignment1 --- 02_activities/assignments/assignment_1.ipynb | 88 +++++++------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git 
a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index ab8ea602e..b231df14b 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, "outputs": [ @@ -333,7 +333,7 @@ "[178 rows x 14 columns]" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -377,33 +377,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "RangeIndex: 178 entries, 0 to 177\n", - "Data columns (total 14 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 alcohol 178 non-null float64\n", - " 1 malic_acid 178 non-null float64\n", - " 2 ash 178 non-null float64\n", - " 3 alcalinity_of_ash 178 non-null float64\n", - " 4 magnesium 178 non-null float64\n", - " 5 total_phenols 178 non-null float64\n", - " 6 flavanoids 178 non-null float64\n", - " 7 nonflavanoid_phenols 178 non-null float64\n", - " 8 proanthocyanins 178 non-null float64\n", - " 9 color_intensity 178 non-null float64\n", - " 10 hue 178 non-null float64\n", - " 11 od280/od315_of_diluted_wines 178 non-null float64\n", - " 12 proline 178 non-null float64\n", - " 13 class 178 non-null int64 \n", - "dtypes: float64(13), int64(1)\n", - "memory usage: 19.6 KB\n" + "178\n" ] } ], "source": [ - "wine_df.info()\n", - "#The dataset contains 178 rows." 
+ "num_obeservations= wine_df.shape[0]\n", + "print(num_obeservations)" ] }, { @@ -416,12 +396,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14\n" + ] + } + ], "source": [ - "# The dataset contains 14 columns. " + "num_obeservations_col= wine_df.shape[1]\n", + "print(num_obeservations_col)" ] }, { @@ -434,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "47989426", "metadata": {}, "outputs": [ @@ -444,13 +433,13 @@ "array([0, 1, 2])" ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# The class variable is integer. \n", + "# The class variable is categorical. \n", "wine_df['class'].unique()\n", "#There are three unique levels, 0, 1 and 2. " ] @@ -495,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "cc899b59", "metadata": {}, "outputs": [ @@ -599,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -608,7 +597,7 @@ "np.random.seed(123)\n", "\n", "# split the data into a training and testing set. 
hint: use train_test_split !\n", - "wine_x_train,wine_x_test,wine_y_train,wine_y_test = train_test_split (predictors, wine_df['class'], train_size=0.75,shuffle= True, stratify = wine_df ['class'] )" + "wine_x_train,wine_x_test,wine_y_train,wine_y_test = train_test_split (predictors_standardized, wine_df['class'], train_size=0.75,shuffle= True, stratify = wine_df ['class'] )" ] }, { @@ -631,21 +620,10 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "08818c64", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'n_neighbors': 20}" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "np.random.seed(123)\n", "\n", @@ -669,8 +647,7 @@ "accuracy_grid = pd.DataFrame (grid_search.cv_results_)\n", "accuracy_grid\n", "\n", - "grid_search.best_params_\n", - "#{'n_neighbors': 20}" + "best_n_neighbors= grid_search.best_params_['n_neighbors']" ] }, { @@ -686,17 +663,17 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "ffefa9f2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.7333333333333333" + "0.9333333333333333" ] }, - "execution_count": 30, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -704,7 +681,7 @@ "source": [ "np.random.seed(123)\n", "\n", - "knn_best= KNeighborsClassifier(n_neighbors= 20)\n", + "knn_best= KNeighborsClassifier(n_neighbors= best_n_neighbors)\n", "\n", "knn_best.fit(\n", " wine_x_train, wine_y_train\n", @@ -713,8 +690,7 @@ "wine_y_predict = knn_best.predict(wine_x_test)\n", "\n", "accuracy_score (wine_y_test, wine_y_predict)\n", - "\n", - "#0.7333333333333333\n" + "\n" ] }, {