diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py
index cd0e5e3f0..ff119eb58 100644
--- a/python_scripts/parameter_tuning_ex_02.py
+++ b/python_scripts/parameter_tuning_ex_02.py
@@ -68,10 +68,10 @@
 # %% [markdown]
 # Use the previously defined model (called `model`) and using two nested `for`
 # loops, make a search of the best combinations of the `learning_rate` and
-# `max_leaf_nodes` parameters. In this regard, you will need to train and test
-# the model by setting the parameters. The evaluation of the model should be
-# performed using `cross_val_score` on the training set. We will use the
-# following parameters search:
+# `max_leaf_nodes` parameters. In this regard, you need to train and test the
+# model by setting the parameters. The evaluation of the model should be
+# performed using `cross_val_score` on the training set. Use the following
+# parameters search:
 # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
 #   the ability of a new tree to correct the error of the previous sequence of
 #   trees
diff --git a/python_scripts/parameter_tuning_ex_03.py b/python_scripts/parameter_tuning_ex_03.py
index 48c9a5c41..85dfda6db 100644
--- a/python_scripts/parameter_tuning_ex_03.py
+++ b/python_scripts/parameter_tuning_ex_03.py
@@ -29,7 +29,7 @@
 )
 
 # %% [markdown]
-# In this exercise, we will progressively define the regression pipeline and
+# In this exercise, we progressively define the regression pipeline and
 # later tune its hyperparameters.
 #
 # Start by defining a pipeline that:
diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py
index 5219d0b51..12bbffb57 100644
--- a/python_scripts/parameter_tuning_grid_search.py
+++ b/python_scripts/parameter_tuning_grid_search.py
@@ -9,7 +9,7 @@
 # # Hyperparameter tuning by grid-search
 #
 # In the previous notebook, we saw that hyperparameters can affect the
-# generalization performance of a model. In this notebook, we will show how to
+# generalization performance of a model. In this notebook, we show how to
 # optimize hyperparameters using a grid-search approach.
 
 # %% [markdown]
@@ -49,8 +49,8 @@
 )
 
 # %% [markdown]
-# We will define a pipeline as seen in the first module. It will handle both
-# numerical and categorical features.
+# We define a pipeline as seen in the first module, to handle both numerical and
+# categorical features.
 #
 # The first step is to select all the categorical columns.
 
@@ -61,7 +61,7 @@
 categorical_columns = categorical_columns_selector(data)
 
 # %% [markdown]
-# Here we will use a tree-based model as a classifier (i.e.
+# Here we use a tree-based model as a classifier (i.e.
 # `HistGradientBoostingClassifier`). That means:
 #
 # * Numerical variables don't need scaling;
@@ -119,8 +119,8 @@
 # code.
 #
 # Let's see how to use the `GridSearchCV` estimator for doing such search. Since
-# the grid-search will be costly, we will only explore the combination
-# learning-rate and the maximum number of nodes.
+# the grid-search is costly, we only explore the combination of learning-rate
+# and the maximum number of nodes.
 
 # %%
 # %%time
@@ -134,7 +134,7 @@
 model_grid_search.fit(data_train, target_train)
 
 # %% [markdown]
-# Finally, we will check the accuracy of our model using the test set.
+# Finally, we check the accuracy of our model using the test set.
 
 # %%
 accuracy = model_grid_search.score(data_test, target_test)
@@ -155,17 +155,17 @@
 
 # %% [markdown]
 # The `GridSearchCV` estimator takes a `param_grid` parameter which defines all
-# hyperparameters and their associated values. The grid-search will be in charge
+# hyperparameters and their associated values. The grid-search is in charge
 # of creating all possible combinations and test them.
 #
-# The number of combinations will be equal to the product of the number of
-# values to explore for each parameter (e.g. in our example 4 x 3 combinations).
-# Thus, adding new parameters with their associated values to be explored become
+# The number of combinations is equal to the product of the number of values to
+# explore for each parameter (e.g. in our example 4 x 3 combinations). Thus,
+# adding new parameters with their associated values to be explored becomes
 # rapidly computationally expensive.
 #
 # Once the grid-search is fitted, it can be used as any other predictor by
-# calling `predict` and `predict_proba`. Internally, it will use the model with
-# the best parameters found during `fit`.
+# calling `predict` and `predict_proba`. Internally, it uses the model with the
+# best parameters found during `fit`.
 #
 # Get predictions for the 5 first samples using the estimator with the best
 # parameters.
@@ -186,8 +186,8 @@
 # parameters "by hand" through a double for loop.
 #
 # In addition, we can inspect all results which are stored in the attribute
-# `cv_results_` of the grid-search. We will filter some specific columns from
-# these results.
+# `cv_results_` of the grid-search. We filter some specific columns from these
+# results.
 
 # %%
 cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
@@ -220,9 +220,9 @@ def shorten_param(param_name):
 # With only 2 parameters, we might want to visualize the grid-search as a
 # heatmap. We need to transform our `cv_results` into a dataframe where:
 #
-# - the rows will correspond to the learning-rate values;
-# - the columns will correspond to the maximum number of leaf;
-# - the content of the dataframe will be the mean test scores.
+# - the rows correspond to the learning-rate values;
+# - the columns correspond to the maximum number of leaf nodes;
+# - the content of the dataframe is the mean test scores.
 
 # %%
 pivoted_cv_results = cv_results.pivot_table(
@@ -259,7 +259,7 @@ def shorten_param(param_name):
 #
 # The precise meaning of those two parameters will be explained later.
 #
-# For now we will note that, in general, **there is no unique optimal parameter
+# For now we note that, in general, **there is no unique optimal parameter
 # setting**: 4 models out of the 12 parameter configurations reach the maximal
 # accuracy (up to small random fluctuations caused by the sampling of the
 # training set).
diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py
index fb7406274..e4ccc7102 100644
--- a/python_scripts/parameter_tuning_nested.py
+++ b/python_scripts/parameter_tuning_nested.py
@@ -12,12 +12,12 @@
 # However, we did not present a proper framework to evaluate the tuned models.
 # Instead, we focused on the mechanism used to find the best set of parameters.
 #
-# In this notebook, we will reuse some knowledge presented in the module
-# "Selecting the best model" to show how to evaluate models where
-# hyperparameters need to be tuned.
+# In this notebook, we reuse some knowledge presented in the module "Selecting
+# the best model" to show how to evaluate models where hyperparameters need to
+# be tuned.
 #
-# Thus, we will first load the dataset and create the predictive model that we
-# want to optimize and later on, evaluate.
+# Thus, we first load the dataset and create the predictive model that we want
+# to optimize and later on, evaluate.
 #
 # ## Loading the dataset
 #
@@ -111,7 +111,7 @@
 # ### With hyperparameter tuning
 #
 # As shown in the previous notebook, one can use a search strategy that uses
-# cross-validation to find the best set of parameters. Here, we will use a
+# cross-validation to find the best set of parameters. Here, we use a
 # grid-search strategy and reproduce the steps done in the previous notebook.
 #
 # First, we have to embed our model into a grid-search and specify the
diff --git a/python_scripts/parameter_tuning_parallel_plot.py b/python_scripts/parameter_tuning_parallel_plot.py
index 304585cb0..340e75dd0 100644
--- a/python_scripts/parameter_tuning_parallel_plot.py
+++ b/python_scripts/parameter_tuning_parallel_plot.py
@@ -110,8 +110,8 @@ def shorten_param(param_name):
 # spread the active ranges and improve the readability of the plot.
 # ```
 #
-# The parallel coordinates plot will display the values of the hyperparameters
-# on different columns while the performance metric is color coded. Thus, we are
+# The parallel coordinates plot displays the values of the hyperparameters on
+# different columns while the performance metric is color coded. Thus, we are
 # able to quickly inspect if there is a range of hyperparameters which is
 # working or not.
 #
diff --git a/python_scripts/parameter_tuning_sol_02.py b/python_scripts/parameter_tuning_sol_02.py
index 1ea4cf572..9c5ceaa2c 100644
--- a/python_scripts/parameter_tuning_sol_02.py
+++ b/python_scripts/parameter_tuning_sol_02.py
@@ -62,10 +62,10 @@
 # %% [markdown]
 # Use the previously defined model (called `model`) and using two nested `for`
 # loops, make a search of the best combinations of the `learning_rate` and
-# `max_leaf_nodes` parameters. In this regard, you will need to train and test
-# the model by setting the parameters. The evaluation of the model should be
-# performed using `cross_val_score` on the training set. We will use the
-# following parameters search:
+# `max_leaf_nodes` parameters. In this regard, you need to train and test the
+# model by setting the parameters. The evaluation of the model should be
+# performed using `cross_val_score` on the training set. Use the following
+# parameters search:
 # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls
 #   the ability of a new tree to correct the error of the previous sequence of
 #   trees
diff --git a/python_scripts/parameter_tuning_sol_03.py b/python_scripts/parameter_tuning_sol_03.py
index 149cc0de1..55773ebd3 100644
--- a/python_scripts/parameter_tuning_sol_03.py
+++ b/python_scripts/parameter_tuning_sol_03.py
@@ -23,8 +23,8 @@
 )
 
 # %% [markdown]
-# In this exercise, we will progressively define the regression pipeline and
-# later tune its hyperparameters.
+# In this exercise, we progressively define the regression pipeline and later
+# tune its hyperparameters.
 #
 # Start by defining a pipeline that:
 # * uses a `StandardScaler` to normalize the numerical data;
@@ -108,8 +108,8 @@
 cv_results = pd.DataFrame(model_random_search.cv_results_)
 
 # %% [markdown] tags=["solution"]
-# To simplify the axis of the plot, we will rename the column of the dataframe
-# and only select the mean test score and the value of the hyperparameters.
+# To simplify the axis of the plot, we rename the column of the dataframe and
+# only select the mean test score and the value of the hyperparameters.
 
 # %% tags=["solution"]
 column_name_mapping = {
@@ -170,7 +170,7 @@
 # vary between 0 and 10,000 (e.g. the variable `"Population"`) and B is a
 # feature that varies between 1 and 10 (e.g. the variable `"AveRooms"`), then
 # distances between samples (rows of the dataframe) are mostly impacted by
-# differences in values of the column A, while values of the column B will be
+# differences in values of the column A, while values of the column B are
 # comparatively ignored. If one applies StandardScaler to such a database, both
 # the values of A and B will be approximately between -3 and 3 and the neighbor
 # structure will be impacted more or less equivalently by both variables.
diff --git a/python_scripts/trees_dataset.py b/python_scripts/trees_dataset.py
index 708c61b29..888eee5a7 100644
--- a/python_scripts/trees_dataset.py
+++ b/python_scripts/trees_dataset.py
@@ -15,7 +15,7 @@
 #
 # ## Classification dataset
 #
-# We will use this dataset in classification setting to predict the penguins'
+# We use this dataset in a classification setting to predict the penguins'
 # species from anatomical information.
 #
 # Each penguin is from one of the three following species: Adelie, Gentoo, and
@@ -26,15 +26,15 @@
 # penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png)
 #
 # This problem is a classification problem since the target is categorical. We
-# will limit our input data to a subset of the original features to simplify our
-# explanations when presenting the decision tree algorithm. Indeed, we will use
+# limit our input data to a subset of the original features to simplify our
+# explanations when presenting the decision tree algorithm. Indeed, we use
 # features based on penguins' culmen measurement. You can learn more about the
 # penguins' culmen with the illustration below:
 #
 # ![Image of
 # culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png)
 #
-# We will start by loading this subset of the dataset.
+# We start by loading this subset of the dataset.
 
 # %%
 import pandas as pd
@@ -73,11 +73,11 @@
 #
 # In a regression setting, the target is a continuous variable instead of
 # categories. Here, we use two features of the dataset to make such a problem:
-# the flipper length will be used as data and the body mass will be the target.
-# In short, we want to predict the body mass using the flipper length.
+# the flipper length is used as data and the body mass as the target. In short,
+# we want to predict the body mass using the flipper length.
 #
-# We will load the dataset and visualize the relationship between the flipper
-# length and the body mass of penguins.
+# We load the dataset and visualize the relationship between the flipper length
+# and the body mass of penguins.
 
 # %%
 penguins = pd.read_csv("../datasets/penguins_regression.csv")
diff --git a/python_scripts/trees_ex_02.py b/python_scripts/trees_ex_02.py
index 6c7d3b1b1..f53fb7566 100644
--- a/python_scripts/trees_ex_02.py
+++ b/python_scripts/trees_ex_02.py
@@ -20,7 +20,7 @@
 # By extrapolation, we refer to values predicted by a model outside of the range
 # of feature values seen during the training.
 #
-# We will first load the regression data.
+# We first load the regression data.
 
 # %%
 import pandas as pd
@@ -61,10 +61,10 @@
 # Write your code here.
 
 # %% [markdown]
-# Now, we will check the extrapolation capabilities of each model. Create a
-# dataset containing a broader range of values than your previous dataset, in
-# other words, add values below and above the minimum and the maximum of the
-# flipper length seen during training.
+# Now, we check the extrapolation capabilities of each model. Create a dataset
+# containing a broader range of values than your previous dataset, in other
+# words, add values below and above the minimum and the maximum of the flipper
+# length seen during training.
 
 # %%
 # Write your code here.
diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py
index dc8594968..218e9b38b 100644
--- a/python_scripts/trees_hyperparameters.py
+++ b/python_scripts/trees_hyperparameters.py
@@ -8,11 +8,11 @@
 # %% [markdown]
 # # Importance of decision tree hyperparameters on generalization
 #
-# In this notebook, we will illustrate the importance of some key
-# hyperparameters on the decision tree; we will demonstrate their effects on the
-# classification and regression problems we saw previously.
+# In this notebook, we illustrate the importance of some key hyperparameters on
+# the decision tree; we demonstrate their effects on the classification and
+# regression problems we saw previously.
 #
-# First, we will load the classification and regression datasets.
+# First, we load the classification and regression datasets.
 
 # %%
 import pandas as pd
@@ -35,7 +35,7 @@
 # %% [markdown]
 # ## Create helper functions
 #
-# We will create some helper functions to plot the data samples as well as the
+# We create some helper functions to plot the data samples as well as the
 # decision boundary for classification and the regression line for regression.
 
 # %%
@@ -135,10 +135,10 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 
 # %% [markdown]
 # For both classification and regression setting, we observe that increasing the
-# depth will make the tree model more expressive. However, a tree that is too
-# deep will overfit the training data, creating partitions which are only
-# correct for "outliers" (noisy samples). The `max_depth` is one of the
-# hyperparameters that one should optimize via cross-validation and grid-search.
+# depth makes the tree model more expressive. However, a tree that is too deep
+# may overfit the training data, creating partitions which are only correct for
+# "outliers" (noisy samples). The `max_depth` is one of the hyperparameters that
+# one should optimize via cross-validation and grid-search.
 
 # %%
 from sklearn.model_selection import GridSearchCV
@@ -172,15 +172,15 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 #
 # The `max_depth` hyperparameter controls the overall complexity of the tree.
 # This parameter is adequate under the assumption that a tree is built
-# symmetrically. However, there is no guarantee that a tree will be symmetrical.
+# symmetrically. However, there is no reason why a tree should be symmetrical.
 # Indeed, optimal generalization performance could be reached by growing some of
 # the branches deeper than some others.
 #
-# We will build a dataset where we will illustrate this asymmetry. We will
-# generate a dataset composed of 2 subsets: one subset where a clear separation
-# should be found by the tree and another subset where samples from both classes
-# will be mixed. It implies that a decision tree will need more splits to
-# classify properly samples from the second subset than from the first subset.
+# We build a dataset where we illustrate this asymmetry. We generate a dataset
+# composed of 2 subsets: one subset where a clear separation should be found by
+# the tree and another subset where samples from both classes are mixed. It
+# implies that a decision tree needs more splits to properly classify samples
+# from the second subset than from the first subset.
 
 # %%
 from sklearn.datasets import make_blobs
@@ -188,11 +188,11 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 data_clf_columns = ["Feature #0", "Feature #1"]
 target_clf_column = "Class"
 
-# Blobs that will be interlaced
+# Blobs that are interlaced
 X_1, y_1 = make_blobs(
     n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0
 )
-# Blobs that will be easily separated
+# Blobs that can be easily separated
 X_2, y_2 = make_blobs(n_samples=300, centers=[[3, 6], [7, 0]], random_state=0)
 
 X = np.concatenate([X_1, X_2], axis=0)
@@ -214,9 +214,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 _ = plt.title("Synthetic dataset")
 
 # %% [markdown]
-# We will first train a shallow decision tree with `max_depth=2`. We would
-# expect this depth to be enough to separate the blobs that are easy to
-# separate.
+# We first train a shallow decision tree with `max_depth=2`. We would expect
+# this depth to be enough to separate the blobs that are easy to separate.
 
 # %%
 max_depth = 2
@@ -228,7 +227,7 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 
 # %% [markdown]
 # As expected, we see that the blue blob in the lower right and the red blob on
-# the top are easily separated. However, more splits will be required to better
+# the top are easily separated. However, more splits are required to better
 # split the blob were both blue and red data points are mixed.
 
 # %%
@@ -239,7 +238,7 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 
 # %% [markdown]
 # We see that the right branch achieves perfect classification. Now, we increase
-# the depth to check how the tree will grow.
+# the depth to check how the tree grows.
 
 # %%
 max_depth = 6
@@ -260,8 +259,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 # beneficial that a branch continue growing.
 #
 # The hyperparameters `min_samples_leaf`, `min_samples_split`, `max_leaf_nodes`,
-# or `min_impurity_decrease` allows growing asymmetric trees and apply a
-# constraint at the leaves or nodes level. We will check the effect of
+# or `min_impurity_decrease` allow growing asymmetric trees and apply a
+# constraint at the leaves or nodes level. We check the effect of
 # `min_samples_leaf`.
 
 # %%
@@ -280,5 +279,5 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 
 # %% [markdown]
 # This hyperparameter allows to have leaves with a minimum number of samples and
-# no further splits will be searched otherwise. Therefore, these hyperparameters
+# no further splits are searched otherwise. Therefore, these hyperparameters
 # could be an alternative to fix the `max_depth` hyperparameter.
diff --git a/python_scripts/trees_regression.py b/python_scripts/trees_regression.py
index 8431c025c..f354bee6f 100644
--- a/python_scripts/trees_regression.py
+++ b/python_scripts/trees_regression.py
@@ -31,9 +31,9 @@
 data_train, target_train = penguins[[feature_name]], penguins[target_name]
 
 # %% [markdown]
-# To illustrate how decision trees are predicting in a regression setting, we
-# will create a synthetic dataset containing all possible flipper length from
-# the minimum to the maximum of the original data.
+# To illustrate how decision trees predict in a regression setting, we create a
+# synthetic dataset containing some of the possible flipper length values
+# between the minimum and the maximum of the original data.
 
 # %%
 import numpy as np
@@ -53,9 +53,9 @@
 # some intuitive understanding on the shape of the decision function of the
 # learned decision trees.
 #
-# However computing an evaluation metric on such a synthetic test set would be
+# However, computing an evaluation metric on such a synthetic test set would be
 # meaningless since the synthetic dataset does not follow the same distribution
-# as the real world data on which the model will be deployed.
+# as the real world data on which the model would be deployed.
 
 # %%
 import matplotlib.pyplot as plt
@@ -67,7 +67,7 @@
 _ = plt.title("Illustration of the regression dataset used")
 
 # %% [markdown]
-# We will first illustrate the difference between a linear model and a decision
+# We first illustrate the difference between a linear model and a decision
 # tree.
 
 # %%
@@ -112,9 +112,8 @@
 
 # %% [markdown]
 # Contrary to linear models, decision trees are non-parametric models: they do
-# not make assumptions about the way data is distributed. This will affect the
-# prediction scheme. Repeating the above experiment will highlight the
-# differences.
+# not make assumptions about the way data is distributed. This affects the
+# prediction scheme. Repeating the above experiment highlights the differences.
 
 # %%
 from sklearn.tree import DecisionTreeRegressor
@@ -170,8 +169,8 @@
 _ = plt.title("Prediction function using a DecisionTreeRegressor")
 
 # %% [markdown]
-# Increasing the depth of the tree will increase the number of partition and
-# thus the number of constant values that the tree is capable of predicting.
+# Increasing the depth of the tree increases the number of partitions and thus
+# the number of constant values that the tree is capable of predicting.
 #
 # In this notebook, we highlighted the differences in behavior of a decision
 # tree used in a classification problem in contrast to a regression problem.
diff --git a/python_scripts/trees_sol_02.py b/python_scripts/trees_sol_02.py
index cc7d5dbce..2235ddaf1 100644
--- a/python_scripts/trees_sol_02.py
+++ b/python_scripts/trees_sol_02.py
@@ -14,7 +14,7 @@
 # By extrapolation, we refer to values predicted by a model outside of the range
 # of feature values seen during the training.
 #
-# We will first load the regression data.
+# We first load the regression data.
 
 # %%
 import pandas as pd
@@ -92,10 +92,10 @@
 # interpolate.
 
 # %% [markdown]
-# Now, we will check the extrapolation capabilities of each model. Create a
-# dataset containing a broader range of values than your previous dataset, in
-# other words, add values below and above the minimum and the maximum of the
-# flipper length seen during training.
+# Now, we check the extrapolation capabilities of each model. Create a dataset
+# containing a broader range of values than your previous dataset, in other
+# words, add values below and above the minimum and the maximum of the flipper
+# length seen during training.
 
 # %%
 # solution
@@ -131,9 +131,9 @@
 _ = plt.title("Prediction of linear model and a decision tree")
 
 # %% [markdown] tags=["solution"]
-# The linear model will extrapolate using the fitted model for flipper lengths <
-# 175 mm and > 235 mm. In fact, we are using the model parametrization to make
-# this predictions.
+# The linear model extrapolates using the fitted model for flipper lengths < 175
+# mm and > 235 mm. In fact, we are using the model parametrization to make these
+# predictions.
 #
 # As mentioned, decision trees are non-parametric models and we observe that
 # they cannot extrapolate. For flipper lengths below the minimum, the mass of