Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 183 additions & 26 deletions 02_activities/assignments/assignment_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 35,
"id": "4a3485d6-ba58-4660-a983-5680821c5719",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -96,7 +96,14 @@
"metadata": {},
"outputs": [],
"source": [
"# Your answer here"
"# Your answer here: 178\n",
"\n",
"# Inspect the whole dataset\n",
"wine_df.info()\n",
"\n",
"# Only show the number of rows. (Copilot, 2025)\n",
"num_rows = wine_df.shape[0]\n",
"print(num_rows)"
]
},
{
Expand All @@ -114,7 +121,14 @@
"metadata": {},
"outputs": [],
"source": [
"# Your answer here"
"# Your answer here: 14\n",
"\n",
"# Inspect the whole dataset\n",
"wine_df.info()\n",
"\n",
"# Only show the number of columns (Copilot, 2025)\n",
"num_columns = wine_df.shape[1]\n",
"print(num_columns)"
]
},
{
Expand All @@ -127,12 +141,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"id": "47989426",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"dtype('int64')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your answer here"
"# Your answer here: integer, 0,1,2\n",
"\n",
"# List the unique class values (this output alone does not show the data type)\n",
"wine_df[\"class\"].unique()\n",
"\n",
"# Return the variable type\n",
"wine_df[\"class\"].dtype\n"
]
},
{
Expand All @@ -146,12 +177,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"id": "bd7b0910",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"source": [
"# Your answer here"
"# Your answer here: 13\n",
"\n",
"num_predictors = wine_df.drop(columns=['class']).shape[1]\n",
"print(num_predictors)\n"
]
},
{
Expand Down Expand Up @@ -204,7 +246,7 @@
"id": "403ef0bb",
"metadata": {},
"source": [
"> Your answer here..."
"If the predictor variables are on different scales, the results can be biased: the model relies on distance metrics, so predictors with larger numeric ranges dominate the distance calculation."
]
},
{
Expand All @@ -220,7 +262,7 @@
"id": "fdee5a15",
"metadata": {},
"source": [
"> Your answer here..."
"Because this is the response variable that we are going to predict, not a predictor variable."
]
},
{
Expand All @@ -236,7 +278,7 @@
"id": "f0676c21",
"metadata": {},
"source": [
"> Your answer here..."
"Setting a random seed makes the random numbers generated afterwards reproducible, so the split into training and test datasets is the same on every run. Using the same seed is essential so that results stay consistent for testing and comparisons."
]
},
{
Expand All @@ -257,16 +299,46 @@
"outputs": [],
"source": [
"# set a seed for reproducibility\n",
"np.random.seed(123)\n",
"np.random.seed(123)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8ab6cdd",
"metadata": {},
"outputs": [],
"source": [
"# Scale the predictors and keep the [class] column unscaled, using the course Python code\n",
"standardized_wine= wine_df.copy()\n",
"columns_to_exclude =['class']\n",
"columns_to_scale= standardized_wine.columns.difference(columns_to_exclude)\n",
"scaler=StandardScaler()\n",
"standardized_wine[columns_to_scale] = scaler.fit_transform(wine_df[columns_to_scale])\n",
"\n",
"standardized_wine"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "95944439",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# split the data into a training and testing set. hint: use train_test_split !\n",
"# Your code here ...\n",
"\n",
"# Your code here ..."
"wine_train, wine_test =train_test_split(\n",
" standardized_wine, train_size=0.75, \n",
" stratify=standardized_wine[\"class\"]\n",
")"
]
},
{
"cell_type": "markdown",
"id": "4604ee03",
"id": "fdfed2d0",
"metadata": {},
"source": [
"#### **Question 3:**\n",
Expand All @@ -289,7 +361,74 @@
"metadata": {},
"outputs": [],
"source": [
"# Your code here..."
"# Your code here...\n",
"# The best value for the n_neighbors is 5\n",
"\n",
"knn=KNeighborsClassifier()\n",
"\n",
"# Define a parameter grid ranging from 1 to 50 and run 10-fold cross-validation\n",
"parameter_grid = {\n",
" \"n_neighbors\": range(1,51),\n",
"}\n",
"wine_tune_grid=GridSearchCV(\n",
"estimator=knn,\n",
"param_grid=parameter_grid,\n",
"cv=10\n",
")\n",
"\n",
"# Group variables for x_train and y_train\n",
"X_train = wine_train[wine_train.columns[0:13]] \n",
"y_train = wine_train[\"class\"]\n",
"\n",
"# Fit the model \n",
"wine_tune_grid.fit(X_train, y_train)\n",
"\n",
"accuracies_grid= pd.DataFrame(wine_tune_grid.cv_results_)\n",
"accuracies_grid"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f79ae62",
"metadata": {},
"outputs": [],
"source": [
"# The plotting code below comes from the sample code in the lecture\n",
"# Create the plot\n",
"plt.figure(figsize=(10, 6))\n",
"\n",
"# Plot the mean cross-validated test score for each value of n_neighbors\n",
"plt.plot(accuracies_grid['param_n_neighbors'], accuracies_grid['mean_test_score'], '-o', color='blue')\n",
"\n",
"# Add axis labels and a title\n",
"plt.xlabel('Number of Neighbors')\n",
"plt.ylabel('Accuracy estimate')\n",
"plt.title('K-Nearest Neighbors Performance')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "c4ebf15c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'n_neighbors': 5}"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the best n_neighbors found by the grid search\n",
"wine_tune_grid.best_params_"
]
},
{
Expand All @@ -305,12 +444,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 96,
"id": "ffefa9f2",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"0.9777777777777777"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Your code here..."
"# Your code here...\n",
"knn = KNeighborsClassifier(n_neighbors=5)\n",
"\n",
"knn.fit(X=X_train, y=y_train)\n",
"\n",
"X_test = wine_test[wine_test.columns[0:13]] \n",
"y_test = wine_test[\"class\"]\n",
"\n",
"wine_test[\"predicted\"] = knn.predict(X_test)\n",
"wine_test[[\"class\", \"predicted\"]]\n",
"\n",
"#Accuracy\n",
"knn.score(X_test,y_test)"
]
},
{
Expand Down Expand Up @@ -365,7 +527,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4",
"display_name": "lcr-env",
"language": "python",
"name": "python3"
},
Expand All @@ -379,12 +541,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
},
"vscode": {
"interpreter": {
"hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
}
"version": "3.11.13"
}
},
"nbformat": 4,
Expand Down
Loading