diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..74ffcc850 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,10 +34,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Matplotlib is building the font cache; this may take a moment.\n" + ] + } + ], "source": [ "# Import standard libraries\n", "import pandas as pd\n", @@ -56,10 +64,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +377,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df.shape[0]" ] }, { @@ -109,12 +407,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df.shape [1]" ] }, { @@ -127,12 +437,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df ['class'].dtype" ] }, { @@ -146,12 +468,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "13" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "len (wine_df.columns)-1" ] }, { @@ -175,10 +509,37 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 10, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +565,8 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> \n", + "When multiple variables enter a computation, each variable may be on a different scale depending on its unit or measurement method. Thus, if one of the variables has an extremely large or small scale, it can dominate the distance calculations used by KNN, distorting the model and reducing its accuracy. In order to minimize the impact of scale on modelling and reflect the actual pattern in the data, we have to standardize the predictor variables." ] }, { @@ -220,7 +582,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "Because it is a categorical variable, not a continuous numeric variable. 
Standardizing it would destroy its meaning, since the numbers (0, 1, 2) represent categories, not magnitudes." ] }, { @@ -236,7 +598,13 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> \n", + "random.seed(123)\n", + "np.random.seed(123)\n", + "\n", + "Without a fixed seed, each run of a random selection function generates a different combination of rows for later computation; setting a seed is important to ensure the same combination of rows is selected each time we rerun the code, which lets others replicate the same result. \n", + "\n", + "The particular seed value chosen is not important (it simply identifies one specific sequence of random draws); however, when rerunning the code for replication, the seed value has to be the same to generate the same result." ] }, { @@ -251,17 +619,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "72c101f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train: (133, 13)\n", + "X_test: (45, 13)\n", + "y_train: (133,)\n", + "y_test: (45,)\n" + ] + } + ], "source": [ "# set a seed for reproducibility\n", "np.random.seed(123)\n", "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", + "from sklearn.model_selection import train_test_split\n", + "x = predictors_standardized\n", + "y = wine_df['class']\n", + "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=123)\n", "\n", - "# Your code here ..." 
+ "# print the shapes of the resulting datasets for verification\n", + "print(\"X_train:\", X_train.shape)\n", + "print(\"X_test:\", X_test.shape)\n", + "print(\"y_train:\", y_train.shape)\n", + "print(\"y_test:\", y_test.shape)" ] }, { @@ -284,12 +671,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best number of neighbors (k): 15\n", + "Best cross-validation accuracy: 0.9846153846153847\n" + ] + } + ], "source": [ - "# Your code here..." + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "import numpy as np\n", + "\n", + "# Initialize KNN classifier\n", + "knn = KNeighborsClassifier()\n", + "# Define the parameter grid for 'n_neighbors' from 1 to 50\n", + "param_grid = {'n_neighbors': np.arange(1, 51)}\n", + "# Set up GridSearchCV with 10-fold cross-validation\n", + "grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy',)\n", + "# Fit the model on the training data\n", + "grid_search.fit(X_train, y_train)\n", + "# Get best hyperparameter and CV score\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "best_score = grid_search.best_score_\n", + "\n", + "print(\"Best number of neighbors (k):\", best_k)\n", + "print(\"Best cross-validation accuracy:\", best_score)" ] }, { @@ -305,12 +718,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best k from grid search: 15\n", + "Test set accuracy: 0.9333333333333333\n" + ] + } + ], "source": [ - "# Your code here..." 
+ "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import accuracy_score \n", + "\n", + "# Extract best_k from the grid search results\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "print(\"Best k from grid search:\", best_k)\n", + "# Train KNN with the best k on the training data\n", + "final_knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "final_knn.fit (X_train, y_train)\n", + "# Predict on the test data based on the trained model\n", + "y_pred = final_knn.predict(X_test)\n", + "# Evaluate accuracy on the test set\n", + "test_accuracy = accuracy_score (y_test, y_pred)\n", + "print(\"Test set accuracy:\", test_accuracy)" ] }, { @@ -365,7 +800,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env", "language": "python", "name": "python3" }, @@ -379,12 +814,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.13" } }, "nbformat": 4,