diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 828092657..9b5a1cd00 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of observations (rows): 178\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "# Number of observations (rows)\n", + "num_rows = wine_df.shape[0]\n", + "print(f\"Number of observations (rows): {num_rows}\")" ] }, { @@ -109,12 +399,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of variables (columns): 14\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "# Number of variables (columns)\n", + "num_columns = wine_df.shape[1]\n", + "print(f\"Number of variables (columns): {num_columns}\")" ] }, { @@ -127,12 +429,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variable type of 'class': int64\n", + "Levels (unique values) of 'class': [0 1 2]\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "# Check data type of 'class' column\n", + "class_dtype = wine_df['class'].dtype\n", + "\n", + "# Get unique values of the 'class' column\n", + "class_levels = wine_df['class'].unique()\n", + "\n", + "# Sort \n", + "class_levels_sorted = np.sort(class_levels)\n", + "\n", + "print(f\"Variable 
type of 'class': {class_dtype}\")\n", + "print(f\"Levels (unique values) of 'class': {class_levels_sorted}\")\n" ] }, { @@ -146,12 +469,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of predictor variables: 13\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "\n", + "num_predictors = wine_df.drop(columns=['class']).shape[1]\n", + "print(f\"Number of predictor variables: {num_predictors}\")\n" ] }, { @@ -175,10 +510,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +566,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - 
"> Your answer here..." + ">>>### Standardization is important for the KNN algorithm because KNN classifies observations by the distance between them. If the predictors are not standardized, their ranges can differ hugely, so features with larger ranges dominate the distance calculation, which biases the model and leads to inaccurate predictions." ] }, { @@ -220,7 +582,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + ">>>### Because it is not a continuous numerical feature; it is a categorical variable used for labelling, not for calculations." ] }, { @@ -236,7 +598,13 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "### random.seed(20)\n", + "### np.random.seed(20)\n", + "\n", + "### Setting a seed is important because it ensures the same random results on every run, which keeps the results consistent and comparable.\n", + "\n", + "### The particular seed value is not important; it can be any integer.\n", + "### Using a consistent seed value helps with reproducibility. Although the particular value is not important, whichever value is selected should be kept fixed so that the results are consistent and repeatable.\n" ] }, { @@ -251,17 +619,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "72c101f2", "metadata": {}, "outputs": [], "source": [ "# set a seed for reproducibility\n", - "np.random.seed(123)\n", + "np.random.seed(20)\n", + "\n", "\n", "# split the data into a training and testing set.
hint: use train_test_split !\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " predictors_standardized, \n", + " wine_df['class'], \n", + " test_size=0.2, \n", + " random_state=20, \n", + " stratify=wine_df['class']\n", + " )\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c793e96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overlap in X indices: 0\n", + "Overlap in y indices: 0\n" + ] + } + ], + "source": [ + "# Confirm the split is non-overlapping by checking for shared indices\n", + "overlap_X = X_train.index.intersection(X_test.index)\n", + "overlap_y = y_train.index.intersection(y_test.index)\n", "\n", - "# Your code here ..." + "print(f\"Overlap in X indices: {len(overlap_X)}\") # Should print 0\n", + "print(f\"Overlap in y indices: {len(overlap_y)}\") # Should print 0" ] }, { @@ -284,12 +683,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best value for n_neighbors: 22\n" + ] + } + ], "source": [ - "# Your code here..." 
+ "# Your code here..\n", + "\n", + "# Step 1 : Create a KNN classifier\n", + "knn = KNeighborsClassifier()\n", + "\n", + "# Step 2 : Parameter grid\n", + "param_grid = {\n", + " 'n_neighbors': list(range(1, 51))}\n", + "\n", + "# Step 3 : Create a GridSearchCV object\n", + "grid_search = GridSearchCV(estimator=knn,\n", + " param_grid=param_grid,\n", + " cv=10,\n", + " scoring='accuracy',) \n", + "\n", + "\n", + "# Step 4 : Fit the model\n", + "grid_search.fit(X_train, y_train)\n", + "# Step 5 : Get the best parameters\n", + "best_n_neighbors = grid_search.best_params_['n_neighbors']\n", + "\n", + "print(\"Best value for n_neighbors:\", best_n_neighbors)\n", + "\n" ] }, { @@ -305,12 +734,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test accuracy with n_neighbors = 22: 0.9722\n" + ] + } + ], "source": [ - "# Your code here..." + "# Your code here...\n", + "\n", + "# Step 1 : Create a KNN classifier with the best n_neighbors found by the grid search above\n", + "knn_best = KNeighborsClassifier(n_neighbors=best_n_neighbors)\n", + "knn_best.fit(X_train, y_train)\n", + "\n", + "# Step 2 : predictions on the test set\n", + "y_pred = knn_best.predict(X_test)\n", + "\n", + "# Step 3 : Calculate accuracy\n", + "test_accuracy = accuracy_score(y_test, y_pred)\n", + "print(f\"Test accuracy with n_neighbors = {best_n_neighbors}: {test_accuracy:.4f}\")" ] }, { @@ -365,7 +813,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -379,12 +827,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.9.7" } }, "nbformat": 4,