diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index 593bceaed..555a59971 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "4a3485d6-ba58-4660-a983-5680821c5719",
"metadata": {},
"outputs": [],
@@ -56,10 +56,288 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 173 | \n",
+ " 13.71 | \n",
+ " 5.65 | \n",
+ " 2.45 | \n",
+ " 20.5 | \n",
+ " 95.0 | \n",
+ " 1.68 | \n",
+ " 0.61 | \n",
+ " 0.52 | \n",
+ " 1.06 | \n",
+ " 7.70 | \n",
+ " 0.64 | \n",
+ " 1.74 | \n",
+ " 740.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 174 | \n",
+ " 13.40 | \n",
+ " 3.91 | \n",
+ " 2.48 | \n",
+ " 23.0 | \n",
+ " 102.0 | \n",
+ " 1.80 | \n",
+ " 0.75 | \n",
+ " 0.43 | \n",
+ " 1.41 | \n",
+ " 7.30 | \n",
+ " 0.70 | \n",
+ " 1.56 | \n",
+ " 750.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 175 | \n",
+ " 13.27 | \n",
+ " 4.28 | \n",
+ " 2.26 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 1.59 | \n",
+ " 0.69 | \n",
+ " 0.43 | \n",
+ " 1.35 | \n",
+ " 10.20 | \n",
+ " 0.59 | \n",
+ " 1.56 | \n",
+ " 835.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 176 | \n",
+ " 13.17 | \n",
+ " 2.59 | \n",
+ " 2.37 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 1.65 | \n",
+ " 0.68 | \n",
+ " 0.53 | \n",
+ " 1.46 | \n",
+ " 9.30 | \n",
+ " 0.60 | \n",
+ " 1.62 | \n",
+ " 840.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 14.13 | \n",
+ " 4.10 | \n",
+ " 2.74 | \n",
+ " 24.5 | \n",
+ " 96.0 | \n",
+ " 2.05 | \n",
+ " 0.76 | \n",
+ " 0.56 | \n",
+ " 1.35 | \n",
+ " 9.20 | \n",
+ " 0.61 | \n",
+ " 1.60 | \n",
+ " 560.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
178 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ ".. ... ... ... ... ... ... \n",
+ "173 13.71 5.65 2.45 20.5 95.0 1.68 \n",
+ "174 13.40 3.91 2.48 23.0 102.0 1.80 \n",
+ "175 13.27 4.28 2.26 20.0 120.0 1.59 \n",
+ "176 13.17 2.59 2.37 20.0 120.0 1.65 \n",
+ "177 14.13 4.10 2.74 24.5 96.0 2.05 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ ".. ... ... ... ... ... \n",
+ "173 0.61 0.52 1.06 7.70 0.64 \n",
+ "174 0.75 0.43 1.41 7.30 0.70 \n",
+ "175 0.69 0.43 1.35 10.20 0.59 \n",
+ "176 0.68 0.53 1.46 9.30 0.60 \n",
+ "177 0.76 0.56 1.35 9.20 0.61 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline class \n",
+ "0 3.92 1065.0 0 \n",
+ "1 3.40 1050.0 0 \n",
+ "2 3.17 1185.0 0 \n",
+ "3 3.45 1480.0 0 \n",
+ "4 2.93 735.0 0 \n",
+ ".. ... ... ... \n",
+ "173 1.74 740.0 2 \n",
+ "174 1.56 750.0 2 \n",
+ "175 1.56 835.0 2 \n",
+ "176 1.62 840.0 2 \n",
+ "177 1.60 560.0 2 \n",
+ "\n",
+ "[178 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from sklearn.datasets import load_wine\n",
"\n",
@@ -91,12 +369,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "56916892",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of observations (rows): 178\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "print(f\"Number of observations (rows): {len(wine_df)}\")"
]
},
{
@@ -109,12 +395,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "df0ef103",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of observations (rows): 14\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "print(f\"Number of observations (rows): {wine_df.shape[1]}\")"
]
},
{
@@ -127,12 +421,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"id": "47989426",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Variable type of the response variable 'class': int32\n",
+ "Levels (unique values) of the response variable 'class': [0 1 2]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Variable type of the response variable 'class': {wine_df.dtypes['class']}\")\n",
+ "print(f\"Levels (unique values) of the response variable 'class': {wine_df['class'].unique()}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5646a71",
+ "metadata": {},
"source": [
- "# Your answer here"
+ "The response variable is a categorical variable."
]
},
{
@@ -146,12 +458,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "bd7b0910",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of predictor variables: 13\n"
+ ]
+ }
+ ],
"source": [
- "# Your answer here"
+ "print(f\"Number of predictor variables: {len(wine_df.columns) - 1}\")"
]
},
{
@@ -175,10 +495,37 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "cc899b59",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n",
+ "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n",
+ "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n",
+ "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n",
+ "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "0 0.808997 1.034819 -0.659563 1.224884 \n",
+ "1 0.568648 0.733629 -0.820719 -0.544721 \n",
+ "2 0.808997 1.215533 -0.498407 2.135968 \n",
+ "3 2.491446 1.466525 -0.981875 1.032155 \n",
+ "4 0.808997 0.663351 0.226796 0.401404 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \n",
+ "0 0.251717 0.362177 1.847920 1.013009 \n",
+ "1 -0.293321 0.406051 1.113449 0.965242 \n",
+ "2 0.269020 0.318304 0.788587 1.395148 \n",
+ "3 1.186068 -0.427544 1.184071 2.334574 \n",
+ "4 -0.319276 0.362177 0.449601 -0.037874 \n"
+ ]
+ }
+ ],
"source": [
"# Select predictors (excluding the last column)\n",
"predictors = wine_df.iloc[:, :-1]\n",
@@ -204,7 +551,13 @@
"id": "403ef0bb",
"metadata": {},
"source": [
- "> Your answer here..."
+ "> Standardizing helps in maintaining the integrity of the distance calculations, and is often important in preparing data for classification (as in KNN) or prediction. This ensures that no single variable dominates the others due to its scale, leading to better and more reliable classification or prediction results. More specifically:\n",
+ "\n",
+ "Equal Weighting: Standardization ensures that each predictor variable contributes equally to the distance calculations used in KNN. Without standardization, variables with larger ranges (e.g., proline, with values in the hundreds to thousands, vs. hue, with values around 1) can disproportionately influence the results.\n",
+ "\n",
+ "Distance Measurement: KNN relies on measuring distances (usually Euclidean) between data points. When predictor variables are on different scales, the distances can be skewed, leading to inaccurate nearest neighbor identification.\n",
+ "\n",
+ "Improved Performance: Standardized data typically improves the accuracy of the KNN algorithm. KNN has no iterative training step, so the benefit is not faster convergence; rather, the nearest neighbors are identified from a balanced comparison of all predictors instead of being driven by whichever variable has the largest scale."
]
},
{
@@ -220,7 +573,9 @@
"id": "fdee5a15",
"metadata": {},
"source": [
- "> Your answer here..."
+ "The class variable represents a categorical label rather than a continuous numeric predictor. Class represents different categories (0 for one class, 1 for another, 2 for another in this case), not continuous quantities with a meaningful numeric difference.\n",
+ "\n",
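+ "Many classification algorithms (including KNN) treat the class variable as a label to predict rather than a (continuous) feature to compare. The labels are never part of the distance calculation, so scaling them brings no benefit; standardizing them would only replace the class codes 0, 1, 2 with arbitrary decimal values that no longer read as category labels."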
]
},
{
@@ -236,7 +591,17 @@
"id": "f0676c21",
"metadata": {},
"source": [
- "> Your answer here..."
+ "Why is setting a seed important?\n",
+ "\n",
+ "Reproducibility: Setting a random seed ensures that you get the same results every time you run the code (if the same seed is used). This is crucial for debugging and comparing results.\n",
+ "\n",
+ "Consistency: It allows others to replicate your results exactly, making your work more reliable and trustworthy.\n",
+ "\n",
+ "Is the particular seed value important?\n",
+ "\n",
+ "No, the particular seed value itself is not important. Any valid integer can be used as the seed (NumPy's `np.random.seed` accepts integers from `0` to `2**32 - 1`). However, once you choose a seed, you should stick to it to ensure reproducibility.\n",
+ "\n",
+ "The specific value is just a starting point for the random number generation process; it's the consistency in using the same seed that matters."
]
},
{
@@ -251,15 +616,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "72c101f2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train predictors shape: (140, 13)\n",
+ "Train response shape: (140,)\n",
+ "Test predictors shape: (38, 13)\n",
+ "Test response shape: (38,)\n"
+ ]
+ }
+ ],
"source": [
"# Do not touch\n",
"np.random.seed(123)\n",
"# Create a random vector of True and False values to split the data\n",
- "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])"
+ "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])\n",
+ "\n",
+ "# Partition the rows with the boolean mask: True rows go to training, False rows go to test\n",
+ "# Training set (rows where split is True)\n",
+ "train_predictors = predictors_standardized[split]\n",
+ "train_response = wine_df.loc[split, 'class']\n",
+ "\n",
+ "# Test set (rows where split is False, selected with the inverted mask)\n",
+ "test_predictors = predictors_standardized[~split]\n",
+ "test_response = wine_df.loc[~split, 'class']\n",
+ "\n",
+ "# Print shapes to confirm the train and test row counts add up to the full dataset\n",
+ "print('Train predictors shape:', train_predictors.shape)\n",
+ "print('Train response shape:', train_response.shape)\n",
+ "print('Test predictors shape:', test_predictors.shape)\n",
+ "print('Test response shape:', test_response.shape)"
]
},
{
@@ -282,12 +673,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "08818c64",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best value for n_neighbors: 8\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here..."
+ "# Question 3: Model initiation and cross-validation\n",
+ "# 1. Initialize the KNN classifier (n_neighbors is supplied by the grid search below)\n",
+ "knn = KNeighborsClassifier()\n",
+ "\n",
+ "# 2. Candidate values for n_neighbors: every integer from 1 to 50 inclusive\n",
+ "param_grid = {'n_neighbors': np.arange(1, 51)}\n",
+ "\n",
+ "# 3. Set up the grid search; cv=10 requests 10-fold cross-validation\n",
+ "grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)\n",
+ "\n",
+ "# Run the search on the training split only; the test split is not passed in here\n",
+ "grid_search.fit(X=train_predictors, y=train_response)\n",
+ "\n",
+ "# 4. Identify and report the best value for n_neighbors found by the search\n",
+ "best_n_neighbors = grid_search.best_params_['n_neighbors']\n",
+ "print('Best value for n_neighbors:', best_n_neighbors)"
]
},
{
@@ -303,12 +717,32 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "ffefa9f2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test set accuracy: 0.9473684210526315\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here..."
+ "# Question 4: Model evaluation\n",
+ "# Build the final KNN with the n_neighbors chosen by the grid search, not a hard-coded copy of it\n",
+ "knn_best = KNeighborsClassifier(n_neighbors=best_n_neighbors)\n",
+ "\n",
+ "# Fit the model on the training data\n",
+ "knn_best.fit(train_predictors, train_response)\n",
+ "\n",
+ "# Predict the labels on the held-out test set\n",
+ "test_predictions = knn_best.predict(test_predictors)\n",
+ "\n",
+ "# Evaluate the performance using accuracy_score (true labels first, predictions second)\n",
+ "accuracy = accuracy_score(test_response, test_predictions)\n",
+ "print('Test set accuracy:', accuracy)"
]
},
{
@@ -363,7 +797,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3.10.4",
+ "display_name": "dsi_participant",
"language": "python",
"name": "python3"
},
@@ -377,12 +811,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.19"
- },
- "vscode": {
- "interpreter": {
- "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
- }
+ "version": "3.9.15"
}
},
"nbformat": 4,