diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 828092657..f49c0d420 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,106 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, + "id": "05336b75", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting pandas\n", + " Downloading pandas-2.3.0-cp312-cp312-win_amd64.whl.metadata (19 kB)\n", + "Requirement already satisfied: numpy in c:\\users\\hashi\\miniconda3\\lib\\site-packages (2.3.0)\n", + "Collecting matplotlib\n", + " Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata (11 kB)\n", + "Collecting scikit-learn\n", + " Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\hashi\\miniconda3\\lib\\site-packages (from pandas) (2.9.0.post0)\n", + "Collecting pytz>=2020.1 (from pandas)\n", + " Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n", + "Collecting tzdata>=2022.7 (from pandas)\n", + " Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)\n", + "Collecting contourpy>=1.0.1 (from matplotlib)\n", + " Downloading contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)\n", + "Collecting cycler>=0.10 (from matplotlib)\n", + " Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n", + "Collecting fonttools>=4.22.0 (from matplotlib)\n", + " Downloading fonttools-4.58.4-cp312-cp312-win_amd64.whl.metadata (108 kB)\n", + "Collecting kiwisolver>=1.3.1 (from matplotlib)\n", + " Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)\n", + "Requirement already satisfied: packaging>=20.0 in c:\\users\\hashi\\miniconda3\\lib\\site-packages (from matplotlib) (24.2)\n", + "Collecting pillow>=8 (from matplotlib)\n", + " Downloading 
pillow-11.2.1-cp312-cp312-win_amd64.whl.metadata (9.1 kB)\n", + "Collecting pyparsing>=2.3.1 (from matplotlib)\n", + " Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)\n", + "Collecting scipy>=1.8.0 (from scikit-learn)\n", + " Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)\n", + "Collecting joblib>=1.2.0 (from scikit-learn)\n", + " Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)\n", + "Collecting threadpoolctl>=3.1.0 (from scikit-learn)\n", + " Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\hashi\\miniconda3\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", + "Downloading pandas-2.3.0-cp312-cp312-win_amd64.whl (11.0 MB)\n", + " ---------------------------------------- 0.0/11.0 MB ? eta -:--:--\n", + " -------- ------------------------------- 2.4/11.0 MB 15.0 MB/s eta 0:00:01\n", + " -------------------- ------------------- 5.5/11.0 MB 14.6 MB/s eta 0:00:01\n", + " ------------------------------ --------- 8.4/11.0 MB 14.5 MB/s eta 0:00:01\n", + " ---------------------------------------- 11.0/11.0 MB 13.7 MB/s eta 0:00:00\n", + "Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl (8.1 MB)\n", + " ---------------------------------------- 0.0/8.1 MB ? eta -:--:--\n", + " -------------- ------------------------- 2.9/8.1 MB 12.9 MB/s eta 0:00:01\n", + " ----------------------------- ---------- 6.0/8.1 MB 14.2 MB/s eta 0:00:01\n", + " ---------------------------------------- 8.1/8.1 MB 13.5 MB/s eta 0:00:00\n", + "Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl (10.7 MB)\n", + " ---------------------------------------- 0.0/10.7 MB ? 
eta -:--:--\n", + " ---------- ----------------------------- 2.9/10.7 MB 14.0 MB/s eta 0:00:01\n", + " ---------------------- ----------------- 6.0/10.7 MB 14.8 MB/s eta 0:00:01\n", + " --------------------------------- ------ 8.9/10.7 MB 14.2 MB/s eta 0:00:01\n", + " ---------------------------------------- 10.7/10.7 MB 13.6 MB/s eta 0:00:00\n", + "Downloading contourpy-1.3.2-cp312-cp312-win_amd64.whl (223 kB)\n", + "Downloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n", + "Downloading fonttools-4.58.4-cp312-cp312-win_amd64.whl (2.2 MB)\n", + " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n", + " ---------------------------------------- 2.2/2.2 MB 14.0 MB/s eta 0:00:00\n", + "Downloading joblib-1.5.1-py3-none-any.whl (307 kB)\n", + "Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl (71 kB)\n", + "Downloading pillow-11.2.1-cp312-cp312-win_amd64.whl (2.7 MB)\n", + " ---------------------------------------- 0.0/2.7 MB ? eta -:--:--\n", + " ---------------------------------------- 2.7/2.7 MB 14.0 MB/s eta 0:00:00\n", + "Downloading pyparsing-3.2.3-py3-none-any.whl (111 kB)\n", + "Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)\n", + "Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl (41.0 MB)\n", + " ---------------------------------------- 0.0/41.0 MB ? 
eta -:--:--\n", + " -- ------------------------------------- 2.9/41.0 MB 13.9 MB/s eta 0:00:03\n", + " ----- ---------------------------------- 6.0/41.0 MB 14.2 MB/s eta 0:00:03\n", + " -------- ------------------------------- 8.9/41.0 MB 14.2 MB/s eta 0:00:03\n", + " ----------- ---------------------------- 12.1/41.0 MB 14.2 MB/s eta 0:00:03\n", + " -------------- ------------------------- 15.2/41.0 MB 14.3 MB/s eta 0:00:02\n", + " ----------------- ---------------------- 17.8/41.0 MB 14.1 MB/s eta 0:00:02\n", + " -------------------- ------------------- 21.0/41.0 MB 14.1 MB/s eta 0:00:02\n", + " ----------------------- ---------------- 23.9/41.0 MB 14.0 MB/s eta 0:00:02\n", + " -------------------------- ------------- 26.7/41.0 MB 14.1 MB/s eta 0:00:02\n", + " ----------------------------- ---------- 29.9/41.0 MB 14.1 MB/s eta 0:00:01\n", + " -------------------------------- ------- 33.0/41.0 MB 14.1 MB/s eta 0:00:01\n", + " ----------------------------------- ---- 36.2/41.0 MB 14.2 MB/s eta 0:00:01\n", + " -------------------------------------- - 39.1/41.0 MB 14.1 MB/s eta 0:00:01\n", + " --------------------------------------- 40.9/41.0 MB 14.1 MB/s eta 0:00:01\n", + " ---------------------------------------- 41.0/41.0 MB 13.6 MB/s eta 0:00:00\n", + "Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\n", + "Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)\n", + "Installing collected packages: pytz, tzdata, threadpoolctl, scipy, pyparsing, pillow, kiwisolver, joblib, fonttools, cycler, contourpy, scikit-learn, pandas, matplotlib\n", + "Successfully installed contourpy-1.3.2 cycler-0.12.1 fonttools-4.58.4 joblib-1.5.1 kiwisolver-1.4.8 matplotlib-3.10.3 pandas-2.3.0 pillow-11.2.1 pyparsing-3.2.3 pytz-2025.2 scikit-learn-1.7.0 scipy-1.15.3 threadpoolctl-3.6.0 tzdata-2025.2\n" + ] + } + ], + "source": [ + "!pip install pandas numpy matplotlib scikit-learn\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, "id": 
"4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -57,9 +156,295 @@ { "cell_type": "code", "execution_count": null, - "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", + "id": "b4835719", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +476,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Ensure CELL INDEX 4 is executed before running this cell\n", + "# This cell calculates the number of rows in the wine_df DataFrame\n", + "wine_df.shape[0]" ] }, { @@ -107,6 +505,27 @@ "_(ii)_ How many variables (columns) does the dataset contain?" ] }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aa69786d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df.shape[1]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -125,6 +544,27 @@ "_(iii)_ What is the 'variable type' of the response variable `class` (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?" ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "efa91f32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(dtype('int64'), array([0, 1, 2]))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df['class'].dtype, wine_df['class'].unique()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -144,6 +584,27 @@ "_(iv)_ How many predictor variables do we have (Hint: all variables other than `class`)? 
" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "82670f50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wine_df.shape[1] - 1" + ] + }, { "cell_type": "code", "execution_count": null, @@ -175,10 +636,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +692,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> t's important to standardize the predictor variables because KNN looks at the distance between points to find neighbors. 
If one feature has much bigger numbers than the others, it can overpower the distance calculation and make the model focus too much on that feature. By standardizing, we put all features on the same scale so they each have a fair impact. " ] }, { @@ -215,6 +703,14 @@ "(ii) Why did we elect not to standard our response variable `Class`?" ] }, + { + "cell_type": "markdown", + "id": "c83828af", + "metadata": {}, + "source": [ + "> The response variable `class` is categorical and represents discrete labels (0, 1, 2) for the wine types. Standardization is not applicable to categorical variables because it is a technique used to scale continuous numerical data. Standardizing `class` would distort its meaning and make it unsuitable for classification tasks." + ] + }, { "cell_type": "markdown", "id": "fdee5a15", @@ -236,7 +734,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> Setting a random seed is like picking a starting point for randomness, so every time you run your code, you get the same random results. This helps you and others get the same answer if you run the code again. The actual number you use for the seed doesn’t matter, as long as you set one." ] }, { @@ -251,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -261,7 +759,7 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." 
+ "X_train, X_test, y_train, y_test = train_test_split(predictors_standardized, wine_df['class'], test_size=0.25, random_state=123)" ] }, { @@ -284,12 +782,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "15" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." + "# Grid search to find the best n_neighbors for KNN using 10-fold cross-validation\n", + "# This process is explained in the YouTube video: https://www.youtube.com/watch?v=Lu7RnTcqn8g\n", + "\n", + "knn = KNeighborsClassifier()\n", + "param_grid = {'n_neighbors': range(1, 51)}\n", + "grid_search = GridSearchCV(knn, param_grid, cv=10)\n", + "grid_search.fit(X_train, y_train)\n", + "grid_search.best_params_['n_neighbors']" ] }, { @@ -305,12 +821,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.9333333333333333" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." 
+ "# Fit a KNN model using the best n_neighbors found from grid search\n", + "# Then evaluate its accuracy on the test set\n", + "\n", + "knn_best = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])\n", + "knn_best.fit(X_train, y_train)\n", + "accuracy_score(y_test, knn_best.predict(X_test))" ] }, { @@ -365,7 +897,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "base", "language": "python", "name": "python3" }, @@ -379,12 +911,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.12.9" } }, "nbformat": 4,