diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..487117730 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -96,7 +96,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Your answer here: 178\n", + "\n", + "# Inspect the whole dataset\n", + "wine_df.info()\n", + "\n", + "# Only show the number of rows. (Copilot, 2025)\n", + "num_rows = wine_df.shape[0]\n", + "print(num_rows)" ] }, { @@ -114,7 +121,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Your answer here: 14\n", + "\n", + "# Inspect the whole dataset\n", + "wine_df.info()\n", + "\n", + "# Only show the number of columns (Copilot, 2025)\n", + "num_columns = wine_df.shape[1]\n", + "print(num_columns)" ] }, { @@ -127,12 +141,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here: integer, 0,1,2\n", + "\n", + "# The following call lists the unique class values but does not show the data type\n", + "wine_df[\"class\"].unique()\n", + "\n", + "# Return the variable type\n", + "wine_df[\"class\"].dtype\n" ] }, { @@ -146,12 +177,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here: 13\n", + "\n", + "num_predictors = wine_df.drop(columns=['class']).shape[1]\n", +
"print(num_predictors)\n" ] }, { @@ -204,7 +246,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "If the predictor variables are on different scales, the results can be biased, because the machine learning model is based on distance metrics and variables with larger scales dominate the distance calculation. " ] }, { @@ -220,7 +262,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "Because this is the response variable we are going to predict, not a predictor variable. " ] }, { @@ -236,7 +278,7 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "A random split keeps the training and test datasets unbiased, and setting a random seed ensures that the random numbers that follow, and therefore the split itself, are reproducible. It is essential to use the same seed so that results are consistent for testing and comparisons. " ] }, { @@ -257,16 +299,46 @@ "outputs": [], "source": [ "# set a seed for reproducibility\n", - "np.random.seed(123)\n", + "np.random.seed(123)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8ab6cdd", + "metadata": {}, + "outputs": [], + "source": [ + "# Scale the data and bring back the column [class], using the course python code\n", + "standardized_wine= wine_df.copy()\n", + "columns_to_exclude =['class']\n", + "columns_to_scale= standardized_wine.columns.difference(columns_to_exclude)\n", + "scaler=StandardScaler()\n", + "standardized_wine[columns_to_scale] = scaler.fit_transform(wine_df[columns_to_scale])\n", + "\n", + "standardized_wine" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "95944439", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# split the data into a training and testing set. hint: use train_test_split !\n", + "# Your code here ...\n", "\n", - "# Your code here ..."
+ "wine_train, wine_test =train_test_split(\n", + " standardized_wine, train_size=0.75, \n", + " stratify=standardized_wine[\"class\"]\n", + ")" ] }, { "cell_type": "markdown", - "id": "4604ee03", + "id": "fdfed2d0", "metadata": {}, "source": [ "#### **Question 3:**\n", @@ -289,7 +361,74 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here..." + "# Your code here...\n", + "# The best value for n_neighbors is 5\n", + "\n", + "knn=KNeighborsClassifier()\n", + "\n", + "# Define a parameter grid ranging from 1 to 50 and run 10-fold cross-validation \n", + "parameter_grid = {\n", + " \"n_neighbors\": range(1,51),\n", + "}\n", + "wine_tune_grid=GridSearchCV(\n", + "estimator=knn,\n", + "param_grid=parameter_grid,\n", + "cv=10\n", + ")\n", + "\n", + "# Group variables for X_train and y_train\n", + "X_train = wine_train[wine_train.columns[0:13]] \n", + "y_train = wine_train[\"class\"]\n", + "\n", + "# Fit the model \n", + "wine_tune_grid.fit(X_train, y_train)\n", + "\n", + "accuracies_grid= pd.DataFrame(wine_tune_grid.cv_results_)\n", + "accuracies_grid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f79ae62", + "metadata": {}, + "outputs": [], + "source": [ + "# The following plotting code is from the sample code in the lecture\n", + "# Create the plot\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot the mean test score for each number of neighbors\n", + "plt.plot(accuracies_grid['param_n_neighbors'], accuracies_grid['mean_test_score'], '-o', color='blue')\n", + "\n", + "# Add axis labels and a title\n", + "plt.xlabel('Number of Neighbors')\n", + "plt.ylabel('Accuracy estimate')\n", + "plt.title('K-Nearest Neighbors Performance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "c4ebf15c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 5}" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } +
], + "source": [ + "# Show the best n_neighbors\n", + "wine_tune_grid.best_params_" ] }, { @@ -305,12 +444,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 96, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.9777777777777777" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." + "# Your code here...\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "\n", + "knn.fit(X=X_train, y=y_train)\n", + "\n", + "X_test = wine_test[wine_test.columns[0:13]] \n", + "y_test = wine_test[\"class\"]\n", + "\n", + "wine_test[\"predicted\"] = knn.predict(X_test)\n", + "wine_test[[\"class\", \"predicted\"]]\n", + "\n", + "# Accuracy on the test set\n", + "knn.score(X_test,y_test)" ] }, { @@ -365,7 +527,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env", "language": "python", "name": "python3" }, @@ -379,12 +541,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.13" } }, "nbformat": 4,