From 9dcbfd0b78fad509aa51bbef26835e21f369f0ee Mon Sep 17 00:00:00 2001 From: m293wang Date: Sat, 11 Jan 2025 12:39:00 -0500 Subject: [PATCH 1/2] first rev --- 02_activities/assignments/assignment_1.ipynb | 460 +++++++++++++++++-- 1 file changed, 428 insertions(+), 32 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 73d92a3ee..fb445e249 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "len(wine_df)" ] }, { @@ -109,12 +399,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "len(wine_df.columns)" ] }, { @@ -127,12 +429,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The variable type is: int32 and the unique values are: [0 1 2]\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "print(f'The variable type is: {wine_df[\"class\"].dtype} and the unique values are: {wine_df[\"class\"].unique()}')" ] }, { @@ -151,7 +462,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Your answer here\n", + "13" ] }, { @@ -175,10 +487,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash 
magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +543,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "Without standardization, results could be skewed by the magnitude of the values instead of the true weight" ] }, { @@ -220,7 +559,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "Class is the prediction, not an explanatory variable" ] }, { @@ -236,7 +575,10 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." 
+ "Setting a seed is important to control randomness in our data split.\n", + "The particular seed value is not important because it does not affect accuracy, it's just a way to make sure the split is random each time while ensuring repeatable results each time it's run.\n", + "\n", + "Syntax for setting a random seed: np.random.seed(#) (the seed is being set in the next code block, so I'm assuming we just provide the syntax here)" ] }, { @@ -251,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "72c101f2", "metadata": {}, "outputs": [], @@ -259,9 +601,21 @@ "# set a seed for reproducibility\n", "np.random.seed(123)\n", "\n", +<<<<<<< Updated upstream "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", "# Your code here ..." +======= + "# Create a random vector of True and False values to split the data\n", + "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])\n", + "\n", + "X_train = predictors_standardized[split]\n", + "X_test = predictors_standardized[~split]\n", + "\n", + "class_target = wine_df['class']\n", + "y_train = class_target[split]\n", + "y_test = class_target[~split]" +>>>>>>> Stashed changes ] }, { @@ -284,12 +638,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 7}" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." 
+ "#initialize knn classifier\n", + "knn_classifier = KNeighborsClassifier()\n", + "\n", + "#define parameter grid\n", + "wine_param_grid = {\n", + " 'n_neighbors':range(1, 50, 2)\n", + "}\n", + "\n", + "#perform grid search\n", + "wine_grid_search = GridSearchCV(\n", + " estimator = knn_classifier,\n", + " param_grid = wine_param_grid,\n", + " cv = 10,\n", + ")\n", + "\n", + "#fit grid search to training data\n", + "wine_grid_search.fit(X_train, y_train)\n", + "\n", + "#return best value for k\n", + "wine_grid_search.best_params_" ] }, { @@ -305,12 +689,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.9736842105263158" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." + "knn_classifier = KNeighborsClassifier(n_neighbors=7)\n", + "knn_classifier.fit(X_train, y_train)\n", + "accuracy_score = knn_classifier.score(\n", + " X_test,\n", + " y_test\n", + ")\n", + "accuracy_score" ] }, { @@ -365,7 +766,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "dsi_participant", "language": "python", "name": "python3" }, @@ -379,12 +780,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.9.15" } }, "nbformat": 4, From 289580922f19b8a558349e0f33aa797e66fb5e3b Mon Sep 17 00:00:00 2001 From: m293wang Date: Sat, 11 Jan 2025 12:43:24 -0500 Subject: [PATCH 2/2] fixing file errors --- 02_activities/assignments/assignment_1.ipynb | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index fb445e249..56ed912b3 100644 --- 
a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -598,14 +598,9 @@ "metadata": {}, "outputs": [], "source": [ - "# set a seed for reproducibility\n", + "# Do not touch\n", "np.random.seed(123)\n", "\n", -<<<<<<< Updated upstream - "# split the data into a training and testing set. hint: use train_test_split !\n", - "\n", - "# Your code here ..." -======= "# Create a random vector of True and False values to split the data\n", "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])\n", "\n", @@ -615,7 +610,6 @@ "class_target = wine_df['class']\n", "y_train = class_target[split]\n", "y_test = class_target[~split]" ->>>>>>> Stashed changes ] }, { @@ -747,11 +741,11 @@ "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked with in your pull request by tagging their GitHub username. Separate submissions are required.\n", "\n", "### Submission Parameters:\n", - "* Submission Due Date: `HH:MM AM/PM - DD/MM/YYYY`\n", + "* Submission Due Date: `11:59 PM - 01/12/2025`\n", "* The branch name for your repo should be: `assignment-1`\n", "* What to submit for this assignment:\n", " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n", - "* What the pull request link should look like for this assignment: `https://github.com//applying_statistical_concepts/pull/`\n", + "* What the pull request link should look like for this assignment: `https://github.com//LCR/pull/`\n", " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n", "\n", "Checklist:\n",