diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py
index ca56a13fb..3515219b5 100644
--- a/python_scripts/02_numerical_pipeline_introduction.py
+++ b/python_scripts/02_numerical_pipeline_introduction.py
@@ -59,7 +59,7 @@
 data.head()
 
 # %% [markdown]
-# We can now linger on the variables, also denominated features, that we later
+# We can now focus on the variables, also denominated features, that we later
 # use to build our predictive model. In addition, we can also check how many
 # samples are available in our dataset.
 
diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py
index 62cd9be98..e42bd06df 100644
--- a/python_scripts/03_categorical_pipeline.py
+++ b/python_scripts/03_categorical_pipeline.py
@@ -253,7 +253,7 @@
 # and check the generalization performance of this machine learning pipeline using
 # cross-validation.
 #
-# Before we create the pipeline, we have to linger on the `native-country`.
+# Before we create the pipeline, we have to focus on the `native-country`.
 # Let's recall some statistics regarding this column.
 
 # %%
@@ -329,9 +329,10 @@
 print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")
 
 # %% [markdown]
-# As you can see, this representation of the categorical variables is
-# slightly more predictive of the revenue than the numerical variables
-# that we used previously.
+# As you can see, this representation of the categorical variables is slightly
+# more predictive of the revenue than the numerical variables that we used
+# previously. This is because we have more (predictive) categorical features
+# than numerical ones.
 
 # %% [markdown]
 #
diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py
index f249a91fb..5f3027507 100644
--- a/python_scripts/cross_validation_train_test.py
+++ b/python_scripts/cross_validation_train_test.py
@@ -12,7 +12,7 @@
 # of predictive models. While this section could be slightly redundant, we
 # intend to go into details into the cross-validation framework.
 #
-# Before we dive in, let's linger on the reasons for always having training and
+# Before we dive in, let's focus on the reasons for always having training and
 # testing sets. Let's first look at the limitation of using a dataset without
 # keeping any samples out.
 #
diff --git a/python_scripts/ensemble_sol_02.py b/python_scripts/ensemble_sol_02.py
index 232ec2c04..061be3e52 100644
--- a/python_scripts/ensemble_sol_02.py
+++ b/python_scripts/ensemble_sol_02.py
@@ -103,3 +103,10 @@
 plt.plot(data_range[feature_name], forest_predictions, label="Random forest")
 
 _ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
+
+# %% [markdown] tags=["solution"]
+# The random forest reduces the overfitting of the individual trees but still
+# overfits itself. In the section on "hyperparameter tuning with ensemble
+# methods" we will see how to further mitigate this effect. Still, interested
+# users may increase the number of estimators in the forest and try different
+# values of, e.g., `min_samples_split`.
diff --git a/python_scripts/linear_models_ex_04.py b/python_scripts/linear_models_ex_04.py
index dd9ae6bb1..473013074 100644
--- a/python_scripts/linear_models_ex_04.py
+++ b/python_scripts/linear_models_ex_04.py
@@ -17,7 +17,7 @@
 # In the previous Module we tuned the hyperparameter `C` of the logistic
 # regression without mentioning that it controls the regularization strength.
 # Later, on the slides on 🎥 **Intuitions on regularized linear models** we
-# metioned that a small `C` provides a more regularized model, whereas a
+# mentioned that a small `C` provides a more regularized model, whereas a
 # non-regularized model is obtained with an infinitely large value of `C`.
 # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
 # model.
diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py
index 942aed56d..623afad8e 100644
--- a/python_scripts/linear_models_sol_04.py
+++ b/python_scripts/linear_models_sol_04.py
@@ -11,7 +11,7 @@
 # In the previous Module we tuned the hyperparameter `C` of the logistic
 # regression without mentioning that it controls the regularization strength.
 # Later, on the slides on 🎥 **Intuitions on regularized linear models** we
-# metioned that a small `C` provides a more regularized model, whereas a
+# mentioned that a small `C` provides a more regularized model, whereas a
 # non-regularized model is obtained with an infinitely large value of `C`.
 # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
 # model.
diff --git a/python_scripts/metrics_regression.py b/python_scripts/metrics_regression.py
index 494447732..4c6d91db7 100644
--- a/python_scripts/metrics_regression.py
+++ b/python_scripts/metrics_regression.py
@@ -97,8 +97,9 @@
 # %% [markdown]
 # The $R^2$ score represents the proportion of variance of the target that is
 # explained by the independent variables in the model. The best score possible
-# is 1 but there is no lower bound. However, a model that predicts the expected
-# value of the target would get a score of 0.
+# is 1 but there is no lower bound. However, a model that predicts the [expected
+# value](https://en.wikipedia.org/wiki/Expected_value) of the target would get a
+# score of 0.
 
 # %%
 from sklearn.dummy import DummyRegressor
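
The note added to ensemble_sol_02.py suggests growing a larger forest and trying different values of `min_samples_split`. A minimal sketch of that kind of experiment, not part of the patch above, using a synthetic 1D regression task in place of the exercise's actual data (which is not reproduced here):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Synthetic stand-in for the exercise's data: a noisy sine wave.
rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(300, 1))
y = np.sin(X).ravel() + rng.normal(scale=0.3, size=X.shape[0])

# Compare a small forest against larger and more constrained ones.
for n_estimators in (10, 100):
    for min_samples_split in (2, 10):
        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_split=min_samples_split,
            random_state=0,
        )
        scores = cross_val_score(forest, X, y, cv=5)
        print(
            f"n_estimators={n_estimators}, min_samples_split={min_samples_split}: "
            f"R2 = {scores.mean():.3f} +/- {scores.std():.3f}"
        )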
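
Both linear_models_ex_04.py and linear_models_sol_04.py restate that `C` behaves as the inverse of `Ridge`'s `alpha`. A small illustration of that intuition, on an arbitrary synthetic dataset rather than the one used in the exercises: a small `C` (strong regularization) shrinks the logistic regression coefficients toward zero, much like a large `alpha` does in `Ridge`.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Arbitrary synthetic classification task, not the exercises' dataset.
X, y = make_classification(n_samples=200, n_features=5, random_state=0)

# Smaller C means stronger regularization, hence smaller coefficients.
for C in (0.01, 1.0, 100.0):
    model = LogisticRegression(C=C).fit(X, y)
    print(f"C={C:>6}: sum of |coefficients| = {abs(model.coef_).sum():.3f}")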
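
The reworded sentence in metrics_regression.py states that a model predicting the expected value of the target gets an R² score of 0, and the next cell in that notebook imports `DummyRegressor` for exactly this kind of check. A self-contained version of the same idea, on synthetic data rather than the notebook's own dataset:

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data standing in for the notebook's dataset.
rng = np.random.RandomState(0)
X = rng.normal(size=(1_000, 3))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=1_000)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Always predicting the mean of the training target yields an R^2 close to 0.
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
print(f"R^2 of the mean predictor: {dummy.score(X_test, y_test):.3f}")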