diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..a3b633b81 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... \n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -94,9 +372,39 @@ "execution_count": null, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + " 13 class 178 non-null int64 \n", + "dtypes: float64(13), int64(1)\n", + "memory usage: 19.6 KB\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "#it has 178 observation rows and 14 columns\n", + "wine_df.info()" ] }, { @@ -109,12 +417,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of columns: 14\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "# The dataset contains 14 columns\n", + "print(\"Total number of columns: \", wine_df.shape[1])" ] }, { @@ -127,12 +446,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your answer here" + "# Your answer 
here\n", + "#class is an Integer type with 3 uniquie values\n", + "\n", + "wine_df['class'].unique()" ] }, { @@ -146,12 +479,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of predictor variables: 13\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "#we have 13 predictor variable\n", + "print(\"number of predictor variables: \", wine_df.shape[1] - 1)" ] }, { @@ -175,10 +518,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +574,9 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "> Your answer here...\n", + "\n", + "it is important because predictors may be measured on different scales which can distort the model. Standardization therefore puts everything on the same scale, so the model treats each variable fairly and one scale is not overpowering the distance calculation therefore misleading results" ] }, { @@ -220,7 +592,8 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "> Your answer here...\n", + "because the y (response variable) is what we are trying to predict so it would be unnecessary to standardize it if it is not a predictor." ] }, { @@ -236,7 +609,8 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "> Your answer here...\n", + "Setting a random seed is important because it makes the randomly generated operations repeatable so that we get the same result for replicability and consistency purposes. the particular seed value is not important but needs to be consistent." 
] }, { @@ -251,17 +625,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "72c101f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 133 entries, 28 to 109\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 133 non-null float64\n", + " 1 malic_acid 133 non-null float64\n", + " 2 ash 133 non-null float64\n", + " 3 alcalinity_of_ash 133 non-null float64\n", + " 4 magnesium 133 non-null float64\n", + " 5 total_phenols 133 non-null float64\n", + " 6 flavanoids 133 non-null float64\n", + " 7 nonflavanoid_phenols 133 non-null float64\n", + " 8 proanthocyanins 133 non-null float64\n", + " 9 color_intensity 133 non-null float64\n", + " 10 hue 133 non-null float64\n", + " 11 od280/od315_of_diluted_wines 133 non-null float64\n", + " 12 proline 133 non-null float64\n", + " 13 class 133 non-null int64 \n", + "dtypes: float64(13), int64(1)\n", + "memory usage: 15.6 KB\n" + ] + } + ], "source": [ "# set a seed for reproducibility\n", "np.random.seed(123)\n", "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", + "Wine_train, Wine_test = train_test_split(wine_df, train_size=0.75, shuffle = True) \n", "\n", - "# Your code here ..." + "# Your code here ...\n", + "Wine_train.info()\n" ] }, { @@ -284,12 +688,77 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fit_timescore_timetest_score
mean0.0198260.0290090.743956
sem0.0105780.0122470.022893
\n", + "
" + ], + "text/plain": [ + " fit_time score_time test_score\n", + "mean 0.019826 0.029009 0.743956\n", + "sem 0.010578 0.012247 0.022893" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." + "# Your code here...\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "X = Wine_train.drop('class', axis=1)\n", + "y = Wine_train['class']\n", + "knn.fit(X, y) \n", + "\n", + "returned_dictionary = cross_validate(estimator = knn, X = X , y = y, cv=10)\n", + "\n", + "cv_10_df = pd.DataFrame(returned_dictionary)\n", + "\n", + "cv_10_metrics = cv_10_df.agg(['mean', 'sem'])\n", + "cv_10_metrics" ] }, { @@ -308,9 +777,1037 @@ "execution_count": null, "id": "ffefa9f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_n_neighborsparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoresplit5_test_scoresplit6_test_scoresplit7_test_scoresplit8_test_scoresplit9_test_scoremean_test_scorestd_test_scorerank_test_score
00.0056430.0031390.0092900.0044641{'n_neighbors': 1}0.7857140.6428570.8571430.8461540.9230770.7692310.8461540.7692310.6153850.8461540.7901100.0920331
10.0026290.0007090.0043310.0011746{'n_neighbors': 6}0.8571430.6428570.6428570.7692310.7692310.6923080.6923080.7692310.6153850.7692310.7219780.0726946
20.0063020.0040610.0094080.00525311{'n_neighbors': 11}0.7142860.7857140.5714290.6923080.7692310.6923080.7692310.7692310.6153850.7692310.7148350.0695209
30.0079430.0033500.0127880.00379316{'n_neighbors': 16}0.6428570.7142860.6428570.7692310.7692310.7692310.6923080.7692310.6923080.6923080.7153850.0487008
40.0084160.0079130.0099970.00506321{'n_neighbors': 21}0.7857140.7142860.6428570.7692310.8461540.6923080.6923080.7692310.6923080.6153850.7219780.0665365
50.0028820.0008530.0050460.00141226{'n_neighbors': 26}0.7857140.6428570.6428570.8461540.8461540.6923080.6923080.6923080.6153850.7692310.7225270.0797844
60.0032080.0011850.0081040.00446231{'n_neighbors': 31}0.7142860.7142860.7142860.7692310.8461540.6923080.6923080.6923080.6153850.7692310.7219780.0583666
70.0042080.0030490.0048510.00153136{'n_neighbors': 36}0.6428570.5714290.7142860.6923080.8461540.6923080.6923080.7692310.6923080.7692310.7082420.07106610
80.0027510.0009120.0041070.00082241{'n_neighbors': 41}0.6428570.6428570.7142860.7692310.8461540.6153850.6923080.8461540.6923080.7692310.7230770.0780992
90.0026520.0010240.0045680.00087346{'n_neighbors': 46}0.7142860.5714290.7142860.6923080.8461540.6923080.6923080.8461540.6923080.7692310.7230770.0770493
\n", + "
" + ], + "text/plain": [ + " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", + "0 0.005643 0.003139 0.009290 0.004464 \n", + "1 0.002629 0.000709 0.004331 0.001174 \n", + "2 0.006302 0.004061 0.009408 0.005253 \n", + "3 0.007943 0.003350 0.012788 0.003793 \n", + "4 0.008416 0.007913 0.009997 0.005063 \n", + "5 0.002882 0.000853 0.005046 0.001412 \n", + "6 0.003208 0.001185 0.008104 0.004462 \n", + "7 0.004208 0.003049 0.004851 0.001531 \n", + "8 0.002751 0.000912 0.004107 0.000822 \n", + "9 0.002652 0.001024 0.004568 0.000873 \n", + "\n", + " param_n_neighbors params split0_test_score \\\n", + "0 1 {'n_neighbors': 1} 0.785714 \n", + "1 6 {'n_neighbors': 6} 0.857143 \n", + "2 11 {'n_neighbors': 11} 0.714286 \n", + "3 16 {'n_neighbors': 16} 0.642857 \n", + "4 21 {'n_neighbors': 21} 0.785714 \n", + "5 26 {'n_neighbors': 26} 0.785714 \n", + "6 31 {'n_neighbors': 31} 0.714286 \n", + "7 36 {'n_neighbors': 36} 0.642857 \n", + "8 41 {'n_neighbors': 41} 0.642857 \n", + "9 46 {'n_neighbors': 46} 0.714286 \n", + "\n", + " split1_test_score split2_test_score split3_test_score split4_test_score \\\n", + "0 0.642857 0.857143 0.846154 0.923077 \n", + "1 0.642857 0.642857 0.769231 0.769231 \n", + "2 0.785714 0.571429 0.692308 0.769231 \n", + "3 0.714286 0.642857 0.769231 0.769231 \n", + "4 0.714286 0.642857 0.769231 0.846154 \n", + "5 0.642857 0.642857 0.846154 0.846154 \n", + "6 0.714286 0.714286 0.769231 0.846154 \n", + "7 0.571429 0.714286 0.692308 0.846154 \n", + "8 0.642857 0.714286 0.769231 0.846154 \n", + "9 0.571429 0.714286 0.692308 0.846154 \n", + "\n", + " split5_test_score split6_test_score split7_test_score split8_test_score \\\n", + "0 0.769231 0.846154 0.769231 0.615385 \n", + "1 0.692308 0.692308 0.769231 0.615385 \n", + "2 0.692308 0.769231 0.769231 0.615385 \n", + "3 0.769231 0.692308 0.769231 0.692308 \n", + "4 0.692308 0.692308 0.769231 0.692308 \n", + "5 0.692308 0.692308 0.692308 0.615385 \n", + "6 0.692308 0.692308 0.692308 0.615385 \n", + "7 0.692308 0.692308 0.769231 0.692308 \n", + "8 0.615385 0.692308 0.846154 0.692308 \n", + "9 0.692308 0.692308 0.846154 0.692308 \n", + "\n", + " split9_test_score mean_test_score std_test_score rank_test_score \n", + "0 0.846154 0.790110 0.092033 1 \n", + "1 0.769231 0.721978 0.072694 6 \n", + "2 0.769231 0.714835 0.069520 9 \n", + "3 0.692308 0.715385 0.048700 8 \n", + "4 0.615385 0.721978 0.066536 5 \n", + "5 0.769231 0.722527 0.079784 4 \n", + "6 0.769231 0.721978 0.058366 6 \n", + "7 0.769231 0.708242 0.071066 10 \n", + "8 0.769231 0.723077 0.078099 2 \n", + "9 0.769231 0.723077 0.077049 3 " + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here..." 
+ "# Your code here...\n", + "parameter_grid = {'n_neighbors':range(1, 51, 5)}\n", + "wine_tune_grid = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=parameter_grid,\n", + " cv=10\n", + ")\n", + "wine_tune_grid.fit(Wine_train.drop('class', axis=1), Wine_train['class']) \n", + "accuracy_grid = pd.DataFrame(wine_tune_grid.cv_results_)\n", + "accuracy_grid \n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "ea523f6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_neighbors': 1}" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "wine_tune_grid.best_params_ \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da77597b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=1)\n",
+       "      [ scikit-learn HTML estimator diagram omitted; see the text/plain output below ]
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=1)" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = KNeighborsClassifier(n_neighbors=wine_tune_grid.best_params_['n_neighbors'])\n", + "X = Wine_train.drop('class', axis=1)\n", + "y = Wine_train['class']\n", + "knn.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac2b012c", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The feature names should match those that were passed during fit.\nFeature names unseen at fit time:\n- predicted_class\n", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[65]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m Wine_test[\u001b[33m\"\u001b[39m\u001b[33mpredicted_class\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mknn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mWine_test\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mclass\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2\u001b[39m Wine_test\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/DSI/LCR/lcr-env/lib/python3.11/site-packages/sklearn/neighbors/_classification.py:274\u001b[39m, in \u001b[36mKNeighborsClassifier.predict\u001b[39m\u001b[34m(self, X)\u001b[39m\n\u001b[32m 271\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.classes_[np.argmax(probabilities, axis=\u001b[32m1\u001b[39m)]\n\u001b[32m 272\u001b[39m \u001b[38;5;66;03m# In that case, we do not need the distances to perform\u001b[39;00m\n\u001b[32m 273\u001b[39m \u001b[38;5;66;03m# the weighting so we do not compute them.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m274\u001b[39m neigh_ind = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mkneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_distance\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 275\u001b[39m neigh_dist = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 276\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/DSI/LCR/lcr-env/lib/python3.11/site-packages/sklearn/neighbors/_base.py:838\u001b[39m, in \u001b[36mKNeighborsMixin.kneighbors\u001b[39m\u001b[34m(self, X, n_neighbors, return_distance)\u001b[39m\n\u001b[32m 836\u001b[39m X = _check_precomputed(X)\n\u001b[32m 837\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m838\u001b[39m X = \u001b[43mvalidate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 839\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 840\u001b[39m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 841\u001b[39m \u001b[43m \u001b[49m\u001b[43mensure_all_finite\u001b[49m\u001b[43m=\u001b[49m\u001b[43mensure_all_finite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 842\u001b[39m \u001b[43m 
\u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcsr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 843\u001b[39m \u001b[43m \u001b[49m\u001b[43mreset\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 844\u001b[39m \u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mC\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 845\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 847\u001b[39m n_samples_fit = \u001b[38;5;28mself\u001b[39m.n_samples_fit_\n\u001b[32m 848\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m n_neighbors > n_samples_fit:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/DSI/LCR/lcr-env/lib/python3.11/site-packages/sklearn/utils/validation.py:2929\u001b[39m, in \u001b[36mvalidate_data\u001b[39m\u001b[34m(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)\u001b[39m\n\u001b[32m 2845\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mvalidate_data\u001b[39m(\n\u001b[32m 2846\u001b[39m _estimator,\n\u001b[32m 2847\u001b[39m /,\n\u001b[32m (...)\u001b[39m\u001b[32m 2853\u001b[39m **check_params,\n\u001b[32m 2854\u001b[39m ):\n\u001b[32m 2855\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Validate input data and set or check feature names and counts of the input.\u001b[39;00m\n\u001b[32m 2856\u001b[39m \n\u001b[32m 2857\u001b[39m \u001b[33;03m This helper function should be used in an estimator that requires input\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 2927\u001b[39m \u001b[33;03m validated.\u001b[39;00m\n\u001b[32m 2928\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m2929\u001b[39m \u001b[43m_check_feature_names\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_estimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreset\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2930\u001b[39m tags = get_tags(_estimator)\n\u001b[32m 2931\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m tags.target_tags.required:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/DSI/LCR/lcr-env/lib/python3.11/site-packages/sklearn/utils/validation.py:2787\u001b[39m, in \u001b[36m_check_feature_names\u001b[39m\u001b[34m(estimator, X, reset)\u001b[39m\n\u001b[32m 2784\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m missing_names \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m unexpected_names:\n\u001b[32m 2785\u001b[39m message += \u001b[33m\"\u001b[39m\u001b[33mFeature names must be in the same order as they were in fit.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m2787\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(message)\n", + "\u001b[31mValueError\u001b[39m: The feature names should match those that were passed during fit.\nFeature names unseen at fit time:\n- predicted_class\n" + ] + } + ], + "source": [ + "Wine_test[\"predicted\"] = knn.predict(Wine_test.drop('class', axis=1))\n", + "Wine_test" ] }, { @@ -365,7 +1862,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + 
"display_name": "lcr-env", "language": "python", "name": "python3" }, @@ -379,12 +1876,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.2" } }, "nbformat": 4,