diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py
index ca56a13fb..3515219b5 100644
--- a/python_scripts/02_numerical_pipeline_introduction.py
+++ b/python_scripts/02_numerical_pipeline_introduction.py
@@ -59,7 +59,7 @@
 data.head()
 
 # %% [markdown]
-# We can now linger on the variables, also denominated features, that we later
+# We can now focus on the variables, also denominated features, that we later
 # use to build our predictive model. In addition, we can also check how many
 # samples are available in our dataset.
 
diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py
index 62cd9be98..e42bd06df 100644
--- a/python_scripts/03_categorical_pipeline.py
+++ b/python_scripts/03_categorical_pipeline.py
@@ -253,7 +253,7 @@
 # and check the generalization performance of this machine learning pipeline using
 # cross-validation.
 #
-# Before we create the pipeline, we have to linger on the `native-country`.
+# Before we create the pipeline, we have to focus on the `native-country`.
 # Let's recall some statistics regarding this column.
 
 # %%
@@ -329,9 +329,10 @@
 print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")
 
 # %% [markdown]
-# As you can see, this representation of the categorical variables is
-# slightly more predictive of the revenue than the numerical variables
-# that we used previously.
+# As you can see, this representation of the categorical variables is slightly
+# more predictive of the revenue than the numerical variables that we used
+# previously. This is because we have more (predictive) categorical features
+# than numerical ones.
 
 # %% [markdown]
 #
diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py
index f249a91fb..5f3027507 100644
--- a/python_scripts/cross_validation_train_test.py
+++ b/python_scripts/cross_validation_train_test.py
@@ -12,7 +12,7 @@
 # of predictive models. While this section could be slightly redundant, we
 # intend to go into details into the cross-validation framework.
 #
-# Before we dive in, let's linger on the reasons for always having training and
+# Before we dive in, let's focus on the reasons for always having training and
 # testing sets. Let's first look at the limitation of using a dataset without
 # keeping any samples out.
 #
diff --git a/python_scripts/ensemble_sol_02.py b/python_scripts/ensemble_sol_02.py
index 232ec2c04..061be3e52 100644
--- a/python_scripts/ensemble_sol_02.py
+++ b/python_scripts/ensemble_sol_02.py
@@ -103,3 +103,10 @@
 plt.plot(data_range[feature_name], forest_predictions, label="Random forest")
 
 _ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
+
+# %% [markdown] tags=["solution"]
+# The random forest reduces the overfitting of the individual trees but still
+# overfits itself. In the section on "hyperparameter tuning with ensemble
+# methods" we will see how to further mitigate this effect. Still, interested
+# users may increase the number of estimators in the forest and try different
+# values of, e.g., `min_samples_split`.
diff --git a/python_scripts/linear_models_ex_04.py b/python_scripts/linear_models_ex_04.py
index dd9ae6bb1..473013074 100644
--- a/python_scripts/linear_models_ex_04.py
+++ b/python_scripts/linear_models_ex_04.py
@@ -17,7 +17,7 @@
 # In the previous Module we tuned the hyperparameter `C` of the logistic
 # regression without mentioning that it controls the regularization strength.
 # Later, on the slides on 🎥 **Intuitions on regularized linear models** we
-# metioned that a small `C` provides a more regularized model, whereas a
+# mentioned that a small `C` provides a more regularized model, whereas a
 # non-regularized model is obtained with an infinitely large value of `C`.
 # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
 # model.
diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py
index 942aed56d..623afad8e 100644
--- a/python_scripts/linear_models_sol_04.py
+++ b/python_scripts/linear_models_sol_04.py
@@ -11,7 +11,7 @@
 # In the previous Module we tuned the hyperparameter `C` of the logistic
 # regression without mentioning that it controls the regularization strength.
 # Later, on the slides on 🎥 **Intuitions on regularized linear models** we
-# metioned that a small `C` provides a more regularized model, whereas a
+# mentioned that a small `C` provides a more regularized model, whereas a
 # non-regularized model is obtained with an infinitely large value of `C`.
 # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
 # model.
diff --git a/python_scripts/metrics_regression.py b/python_scripts/metrics_regression.py
index 494447732..4c6d91db7 100644
--- a/python_scripts/metrics_regression.py
+++ b/python_scripts/metrics_regression.py
@@ -97,8 +97,9 @@
 # %% [markdown]
 # The $R^2$ score represents the proportion of variance of the target that is
 # explained by the independent variables in the model. The best score possible
-# is 1 but there is no lower bound. However, a model that predicts the expected
-# value of the target would get a score of 0.
+# is 1 but there is no lower bound. However, a model that predicts the [expected
+# value](https://en.wikipedia.org/wiki/Expected_value) of the target would get a
+# score of 0.
 
 # %%
 from sklearn.dummy import DummyRegressor
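
The note added to ensemble_sol_02.py suggests growing a larger forest and trying different values of `min_samples_split`. A minimal sketch of that kind of experiment, not part of the patch above, using a synthetic 1D regression task in place of the exercise's actual data (which is not reproduced here):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Synthetic stand-in for the exercise's data: a noisy sine wave.
rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(300, 1))
y = np.sin(X).ravel() + rng.normal(scale=0.3, size=X.shape[0])

# Compare a small forest against larger and more constrained ones.
for n_estimators in (10, 100):
    for min_samples_split in (2, 10):
        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_split=min_samples_split,
            random_state=0,
        )
        scores = cross_val_score(forest, X, y, cv=5)
        print(
            f"n_estimators={n_estimators}, min_samples_split={min_samples_split}: "
            f"R2 = {scores.mean():.3f} +/- {scores.std():.3f}"
        )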
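
Both linear_models_ex_04.py and linear_models_sol_04.py restate that `C` behaves as the inverse of `Ridge`'s `alpha`. A small illustration of that intuition, on an arbitrary synthetic dataset rather than the one used in the exercises: a small `C` (strong regularization) shrinks the logistic regression coefficients toward zero, much like a large `alpha` does in `Ridge`.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Arbitrary synthetic classification task, not the exercises' dataset.
X, y = make_classification(n_samples=200, n_features=5, random_state=0)

# Smaller C means stronger regularization, hence smaller coefficients.
for C in (0.01, 1.0, 100.0):
    model = LogisticRegression(C=C).fit(X, y)
    print(f"C={C:>6}: sum of |coefficients| = {abs(model.coef_).sum():.3f}")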
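
The reworded sentence in metrics_regression.py states that a model predicting the expected value of the target gets an R² score of 0, and the next cell in that notebook imports `DummyRegressor` for exactly this kind of check. A self-contained version of the same idea, on synthetic data rather than the notebook's own dataset:

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data standing in for the notebook's dataset.
rng = np.random.RandomState(0)
X = rng.normal(size=(1_000, 3))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=1_000)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Always predicting the mean of the training target yields an R^2 close to 0.
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
print(f"R^2 of the mean predictor: {dummy.score(X_test, y_test):.3f}")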