diff --git a/Notebooks/04_preprocessing_and_training.ipynb b/Notebooks/04_preprocessing_and_training.ipynb
index 94ff2aeba..f7464f72e 100644
--- a/Notebooks/04_preprocessing_and_training.ipynb
+++ b/Notebooks/04_preprocessing_and_training.ipynb
@@ -986,10 +986,10 @@
     "#Save the 'Name', 'state', and 'Region' columns from the train/test data into names_train and names_test\n",
     "#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'\n",
     "names_list = ['Name', 'state', 'Region']\n",
-    "names_train = X_train[___]\n",
-    "names_test = X_test[___]\n",
-    "X_train.___(columns=names_list, inplace=___)\n",
-    "X_test.___(columns=names_list, inplace=___)\n",
+    "names_train = X_train[names_list]\n",
+    "names_test = X_test[names_list]\n",
+    "X_train.drop(columns=names_list, inplace=True)\n",
+    "X_test.drop(columns=names_list, inplace=True)\n",
     "X_train.shape, X_test.shape"
    ]
   },
@@ -1001,7 +1001,7 @@
    "source": [
     "#Code task 2#\n",
     "#Check the `dtypes` attribute of `X_train` to verify all features are numeric\n",
-    "X_train.___"
+    "X_train.dtypes"
    ]
   },
   {
@@ -1012,7 +1012,7 @@
    "source": [
     "#Code task 3#\n",
     "#Repeat this check for the test split in `X_test`\n",
-    "X_test.___"
+    "X_test.dtypes"
    ]
   },
   {
@@ -1044,7 +1044,7 @@
    "source": [
     "#Code task 4#\n",
     "#Calculate the mean of `y_train`\n",
-    "train_mean = y_train.___\n",
+    "train_mean = y_train.mean()\n",
     "train_mean"
    ]
   },
@@ -1066,8 +1066,8 @@
     "#Hint, call its `.fit()` method with `X_train` and `y_train` as arguments\n",
     "#Then print the object's `constant_` attribute and verify it's the same as the mean above\n",
     "dumb_reg = DummyRegressor(strategy='mean')\n",
-    "dumb_reg.___(___, ___)\n",
-    "dumb_reg.___"
+    "dumb_reg.fit(X_train, y_train)\n",
+    "dumb_reg.constant_"
    ]
   },
   {
@@ -1140,9 +1140,9 @@
     "    ypred -- the predicted values\n",
     "    \"\"\"\n",
     "    ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)\n",
-    "    sum_sq_tot = np.___((y - ybar)**2) #total sum of squares error\n",
-    "    sum_sq_res = np.___((y - ypred)**2) #residual sum of squares error\n",
-    "    R2 = 1.0 - ___ / ___\n",
+    "    sum_sq_tot = np.sum((y - ybar)**2) #total sum of squares error\n",
+    "    sum_sq_res = np.sum((y - ypred)**2) #residual sum of squares error\n",
+    "    R2 = 1.0 - sum_sq_res / sum_sq_tot\n",
     "    return R2"
    ]
   },
@@ -1398,8 +1398,8 @@
     "    y -- the observed values\n",
     "    ypred -- the predicted values\n",
     "    \"\"\"\n",
-    "    sq_error = (___ - ___)**2\n",
-    "    mse = np.mean(___)\n",
+    "    sq_error = (y - ypred)**2\n",
+    "    mse = np.mean(sq_error)\n",
     "    return mse"
    ]
   },
@@ -1805,8 +1805,8 @@
     "#Code task 9#\n",
     "#Call `X_train` and `X_test`'s `fillna()` method, passing `X_defaults_median` as the values to use\n",
     "#Assign the results to `X_tr` and `X_te`, respectively\n",
-    "X_tr = X_train.___(___)\n",
-    "X_te = X_test.___(___)"
+    "X_tr = X_train.fillna(X_defaults_median)\n",
+    "X_te = X_test.fillna(X_defaults_median)"
    ]
   },
   {
@@ -1834,9 +1834,9 @@
     "#then use it's `transform()` method to apply the scaling to both the train and test split\n",
     "#data (`X_tr` and `X_te`), naming the results `X_tr_scaled` and `X_te_scaled`, respectively\n",
     "scaler = StandardScaler()\n",
-    "scaler.___(X_tr)\n",
-    "X_tr_scaled = scaler.___(X_tr)\n",
-    "X_te_scaled = scaler.___(X_te)"
+    "scaler.fit(X_tr)\n",
+    "X_tr_scaled = scaler.transform(X_tr)\n",
+    "X_te_scaled = scaler.transform(X_te)"
    ]
   },
   {
@@ -1871,8 +1871,8 @@
     "#Code task 11#\n",
     "#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data\n",
     "#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively\n",
respectively\n", - "y_tr_pred = lm.___(X_tr_scaled)\n", - "y_te_pred = lm.___(X_te_scaled)" + "y_tr_pred = lm.predict(X_tr_scaled)\n", + "y_te_pred = lm.predict(X_te_scaled)" ] }, { @@ -1921,7 +1921,7 @@ "#Now calculate the mean absolute error scores using `sklearn`'s `mean_absolute_error` function\n", "# as we did above for R^2\n", "# MAE - train, test\n", - "median_mae = ___(y_train, y_tr_pred), ___(y_test, y_te_pred)\n", + "median_mae = mae_score(y_train, y_tr_pred), mae_score(y_test, y_te_pred)\n", "median_mae" ] }, @@ -1941,7 +1941,7 @@ "#Code task 13#\n", "#And also do the same using `sklearn`'s `mean_squared_error`\n", "# MSE - train, test\n", - "median_mse = ___(___, ___), ___(___, ___)\n", + "median_mse = mse_score(y_train, y_tr_pred), mse_score(y_test, y_te_pred)\n", "median_mse" ] }, @@ -1975,7 +1975,7 @@ "#Code task 14#\n", "#As we did for the median above, calculate mean values for imputing missing values\n", "# These are the values we'll use to fill in any missing values\n", - "X_defaults_mean = X_train.___()\n", + "X_defaults_mean = X_train.mean()\n", "X_defaults_mean" ] }, @@ -2241,7 +2241,7 @@ "source": [ "#Code task 15#\n", "#Call the pipe's `fit()` method with `X_train` and `y_train` as arguments\n", - "pipe.___(___, ___)" + "pipe.fit(X_train, y_train)" ] }, { @@ -2459,7 +2459,7 @@ "pipe = make_pipeline(\n", " SimpleImputer(strategy='median'), \n", " StandardScaler(),\n", - " ___(___),\n", + " f_regression(SelectKBest),\n", " LinearRegression()\n", ")" ] @@ -2577,7 +2577,7 @@ "pipe15 = make_pipeline(\n", " SimpleImputer(strategy='median'), \n", " StandardScaler(),\n", - " ___(___, k=___),\n", + " f_regression(SelectKBest, k=15),\n", " LinearRegression()\n", ")" ] @@ -2804,7 +2804,7 @@ "#Code task 18#\n", "#Call `pipe`'s `get_params()` method to get a dict of available parameters and print their names\n", "#using dict's `keys()` method\n", - "pipe.___.keys()" + "pipe.get_params().keys()" ] }, { @@ -2892,7 +2892,7 @@ "source": [ "#Code task 19#\n", "#Print the `best_params_` attribute of `lr_grid_cv`\n", - "lr_grid_cv.___" + "lr_grid_cv.best_params_" ] }, { @@ -2903,7 +2903,7 @@ "source": [ "#Code task 20#\n", "#Assign the value of k from the above dict of `best_params_` and assign it to `best_k`\n", - "___ = lr_grid_cv.___['selectkbest__k']\n", + "best_k = lr_grid_cv.best_params_['selectkbest__k']\n", "plt.subplots(figsize=(10, 5))\n", "plt.errorbar(cv_k, score_mean, yerr=score_std)\n", "plt.axvline(x=best_k, c='r', ls='--', alpha=.5)\n", @@ -2955,7 +2955,7 @@ "#sorting the values in descending order\n", "coefs = lr_grid_cv.best_estimator_.named_steps.linearregression.coef_\n", "features = X_train.columns[selected]\n", - "pd.Series(___, index=___).___(ascending=___)" + "pd.Series(coefs, index=features).sort(ascending=False)" ] }, { @@ -3000,9 +3000,9 @@ "#StandardScaler(),\n", "#and then RandomForestRegressor() with a random state of 47\n", "RF_pipe = make_pipeline(\n", - " ___(strategy=___),\n", - " ___,\n", - " ___(random_state=___)\n", + " SimpleImputer(strategy=median),\n", + " StandardScaler(),\n", + " RandomForestRegressor(random_state=47)\n", ")" ] }, @@ -3023,7 +3023,7 @@ "#Call `cross_validate` to estimate the pipeline's performance.\n", "#Pass it the random forest pipe object, `X_train` and `y_train`,\n", "#and get it to use 5-fold cross-validation\n", - "rf_default_cv_results = cross_validate(___, ___, ___, cv=___)" + "rf_default_cv_results = cross_validate(RF_pipe, X_train, y_train, cv=5-fold)" ] }, { @@ -3137,7 +3137,7 @@ "#Code task 24#\n", "#Call 
     "#dict for parameters to evaluate, 5-fold cross-validation, and all available CPU cores (if desired)\n",
-    "rf_grid_cv = GridSearchCV(___, param_grid=___, cv=___, n_jobs=-1)"
+    "rf_grid_cv = GridSearchCV(RF_pipe, param_grid=grid_params, cv=5, n_jobs=-1)"
    ]
   },
   {
@@ -3149,7 +3149,7 @@
     "#Code task 25#\n",
     "#Now call the `GridSearchCV`'s `fit()` method with `X_train` and `y_train` as arguments\n",
     "#to actually start the grid search. This may take a minute or two.\n",
-    "rf_grid_cv.___(___, ___)"
+    "rf_grid_cv.fit(X_train, y_train)"
    ]
   },
   {
@@ -3160,7 +3160,7 @@
    "source": [
     "#Code task 26#\n",
     "#Print the best params (`best_params_` attribute) from the grid search\n",
-    "rf_grid_cv.___"
+    "rf_grid_cv.best_params_"
    ]
   },
   {
@@ -3233,7 +3233,7 @@
     "#training data column names, sorting the values in descending order\n",
     "plt.subplots(figsize=(10, 5))\n",
-    "imps = rf_grid_cv.best_estimator_.named_steps.randomforestregressor.___\n",
-    "rf_feat_imps = pd.Series(___, index=X_train.columns).sort_values(ascending=False)\n",
+    "imps = rf_grid_cv.best_estimator_.named_steps.randomforestregressor.feature_importances_\n",
+    "rf_feat_imps = pd.Series(imps, index=X_train.columns).sort_values(ascending=False)\n",
     "rf_feat_imps.plot(kind='bar')\n",
     "plt.xlabel('features')\n",
     "plt.ylabel('importance')\n",
@@ -3492,12 +3492,12 @@
     "#and the current datetime (`datetime.datetime.now()`) to the `build_datetime` attribute\n",
     "#Let's call this model version '1.0'\n",
     "best_model = rf_grid_cv.best_estimator_\n",
-    "best_model.version = ___\n",
-    "best_model.pandas_version = ___\n",
-    "best_model.numpy_version = ___\n",
-    "best_model.sklearn_version = ___\n",
+    "best_model.version = '1.0'\n",
+    "best_model.pandas_version = pd.__version__\n",
+    "best_model.numpy_version = np.__version__\n",
+    "best_model.sklearn_version = sklearn_version\n",
     "best_model.X_columns = [col for col in X_train.columns]\n",
-    "best_model.build_datetime = ___"
+    "best_model.build_datetime = datetime.datetime.now()"
    ]
   },
   {
@@ -3530,7 +3530,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**A: 1** Your answer here"
+    "**A: 1** Simply predicting the mean ticket price gave a useful baseline: on average, that guess was within about $19 of the actual price. A linear regression model improved on this considerably, explaining over 80% of the variance on the training set and over 70% on the test set, with predictions typically within about $9 of the actual price. However, that estimate did not hold up consistently when the model was assessed beyond a single train/test split. The random forest model did better still: its cross-validation error was almost $1 lower than the linear model's, and its estimates were consistent across the various performance checks. I've chosen the random forest model because of the consistency of its results and because its estimates can be applied across other parts of the data to support further proactive analysis and pricing decisions."
    ]
   }
  ],