diff --git a/01_materials/notebooks/Classification-1.ipynb b/01_materials/notebooks/Classification-1.ipynb index 7b6959a7a..dc4087017 100644 --- a/01_materials/notebooks/Classification-1.ipynb +++ b/01_materials/notebooks/Classification-1.ipynb @@ -2326,7 +2326,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -2340,7 +2340,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..a9f53a6d3 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,146 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "id": "568de9a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/Bakhtiari/Desktop/Assignments/LCR/lcr-env/bin/python\n" + ] + } + ], + "source": [ + "import sys\n", + "print(sys.executable)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7d4bc334", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "832.65s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/Bakhtiari/Desktop/Assignments/LCR/lcr-env/bin/python: No module named pip\n" + ] + } + ], + "source": [ + "!{sys.executable} -m pip install pandas matplotlib\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9143701", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/Bakhtiari/Desktop/Assignments/LCR/lcr-env/bin/python\n" + ] + } + ], + "source": [ + "import sys\n", + "print(sys.executable)\n" + ] + 
}, + { + "cell_type": "code", + "execution_count": 1, + "id": "c4ec61dd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Matplotlib is building the font cache; this may take a moment.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a81882e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/Bakhtiari/Desktop/Assignments/LCR/lcr-env/bin/python\n" + ] + } + ], + "source": [ + "import sys\n", + "print(sys.executable)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7b332128", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting scikit-learn\n", + " Downloading scikit_learn-1.8.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)\n", + "Requirement already satisfied: numpy>=1.24.1 in /Users/Bakhtiari/Desktop/Assignments/LCR/lcr-env/lib/python3.11/site-packages (from scikit-learn) (2.3.5)\n", + "Collecting scipy>=1.10.0 (from scikit-learn)\n", + " Downloading scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)\n", + "Collecting joblib>=1.3.0 (from scikit-learn)\n", + " Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)\n", + "Collecting threadpoolctl>=3.2.0 (from scikit-learn)\n", + " Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)\n", + "Downloading scikit_learn-1.8.0-cp311-cp311-macosx_12_0_arm64.whl (8.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.1/8.1 MB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading joblib-1.5.3-py3-none-any.whl (309 kB)\n", + "Downloading scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl (20.9 
MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.9/20.9 MB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m \u001b[33m0:00:01\u001b[0mm0:00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\n", + "Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4/4\u001b[0m [scikit-learn][0m [scikit-learn]\n", + "\u001b[1A\u001b[2KSuccessfully installed joblib-1.5.3 scikit-learn-1.8.0 scipy-1.16.3 threadpoolctl-3.6.0\n" + ] + } + ], + "source": [ + "!{sys.executable} -m pip install scikit-learn\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +195,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +508,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df.shape[0]" ] }, { @@ -109,12 +538,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df.shape[1]" ] }, { @@ -127,12 +568,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df['class'].dtype" ] }, { @@ -146,12 +599,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "13" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "wine_df.drop(columns='class').shape[1]" ] }, { @@ -175,10 +640,37 @@ }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 13, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +696,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "KNN is based on distance, so predictors on larger scales can dominate the distance calculation. Standardisation ensures all predictors contribute equally." ] }, { @@ -220,7 +712,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "'class' is a categorical variable, not a continuous predictor; therefore, standardisation is not meaningful here. " ] }, { @@ -236,7 +728,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "A random seed ensures reproducible results. The specific seed value is not important, as long as it is set consistently."
] }, { @@ -251,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -261,7 +753,14 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." + "# Your code here ...\n", + "\n", + "X = predictors_standardized\n", + "y = wine_df['class']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.25, random_state=123, stratify=y\n", + ")" ] }, { @@ -277,6 +776,7 @@ "Perform a grid search to tune the `n_neighbors` hyperparameter using 10-fold cross-validation. Follow these steps:\n", "\n", "1. Initialize the KNN classifier using `KNeighborsClassifier()`.\n", + "\n", "2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n", "3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n", "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results." @@ -284,12 +784,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(7)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." 
+ "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "import numpy as np\n", + "\n", + "knn = KNeighborsClassifier() #step 1 to define the parameter\n", + "\n", + "parameter_grid = {'n_neighbors': np.arange(1,51)}\n", + "\n", + "grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "\n", + "\n", + "grid.fit(X_train, y_train)\n", + "\n", + "\n", + "best_k = grid.best_params_[\"n_neighbors\"]\n", + "best_k\n", + "\n", + "\n", + "\n" ] }, { @@ -305,12 +841,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.9333333333333333" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." + "best_knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "best_knn.fit(X_train, y_train)\n", + "\n", + "\n", + "y_pred = best_knn.predict(X_test)\n", + "accuracy_score(y_test, y_pred)" ] }, { @@ -365,7 +917,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env (3.11.14)", "language": "python", "name": "python3" }, @@ -379,12 +931,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.14" } }, "nbformat": 4,