diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py index cd0e5e3f0..ff119eb58 100644 --- a/python_scripts/parameter_tuning_ex_02.py +++ b/python_scripts/parameter_tuning_ex_02.py @@ -68,10 +68,10 @@ # %% [markdown] # Use the previously defined model (called `model`) and using two nested `for` # loops, make a search of the best combinations of the `learning_rate` and -# `max_leaf_nodes` parameters. In this regard, you will need to train and test -# the model by setting the parameters. The evaluation of the model should be -# performed using `cross_val_score` on the training set. We will use the -# following parameters search: +# `max_leaf_nodes` parameters. In this regard, you need to train and test the +# model by setting the parameters. The evaluation of the model should be +# performed using `cross_val_score` on the training set. Use the following +# parameters search: # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls # the ability of a new tree to correct the error of the previous sequence of # trees diff --git a/python_scripts/parameter_tuning_ex_03.py b/python_scripts/parameter_tuning_ex_03.py index 48c9a5c41..85dfda6db 100644 --- a/python_scripts/parameter_tuning_ex_03.py +++ b/python_scripts/parameter_tuning_ex_03.py @@ -29,7 +29,7 @@ ) # %% [markdown] -# In this exercise, we will progressively define the regression pipeline and +# In this exercise, we progressively define the regression pipeline and # later tune its hyperparameters. 
# # Start by defining a pipeline that: diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py index 5219d0b51..12bbffb57 100644 --- a/python_scripts/parameter_tuning_grid_search.py +++ b/python_scripts/parameter_tuning_grid_search.py @@ -9,7 +9,7 @@ # # Hyperparameter tuning by grid-search # # In the previous notebook, we saw that hyperparameters can affect the -# generalization performance of a model. In this notebook, we will show how to +# generalization performance of a model. In this notebook, we show how to # optimize hyperparameters using a grid-search approach. # %% [markdown] @@ -49,8 +49,8 @@ ) # %% [markdown] -# We will define a pipeline as seen in the first module. It will handle both -# numerical and categorical features. +# We define a pipeline as seen in the first module, to handle both numerical and +# categorical features. # # The first step is to select all the categorical columns. @@ -61,7 +61,7 @@ categorical_columns = categorical_columns_selector(data) # %% [markdown] -# Here we will use a tree-based model as a classifier (i.e. +# Here we use a tree-based model as a classifier (i.e. # `HistGradientBoostingClassifier`). That means: # # * Numerical variables don't need scaling; @@ -119,8 +119,8 @@ # code. # # Let's see how to use the `GridSearchCV` estimator for doing such search. Since -# the grid-search will be costly, we will only explore the combination -# learning-rate and the maximum number of nodes. +# the grid-search is costly, we only explore the combination learning-rate and +# the maximum number of nodes. # %% # %%time @@ -134,7 +134,7 @@ model_grid_search.fit(data_train, target_train) # %% [markdown] -# Finally, we will check the accuracy of our model using the test set. +# Finally, we check the accuracy of our model using the test set. 
# %% accuracy = model_grid_search.score(data_test, target_test) @@ -155,17 +155,17 @@ # %% [markdown] # The `GridSearchCV` estimator takes a `param_grid` parameter which defines all -# hyperparameters and their associated values. The grid-search will be in charge +# hyperparameters and their associated values. The grid-search is in charge # of creating all possible combinations and test them. # -# The number of combinations will be equal to the product of the number of -# values to explore for each parameter (e.g. in our example 4 x 3 combinations). -# Thus, adding new parameters with their associated values to be explored become +# The number of combinations is equal to the product of the number of values to +# explore for each parameter (e.g. in our example 4 x 3 combinations). Thus, +# adding new parameters with their associated values to be explored becomes # rapidly computationally expensive. # # Once the grid-search is fitted, it can be used as any other predictor by -# calling `predict` and `predict_proba`. Internally, it will use the model with -# the best parameters found during `fit`. +# calling `predict` and `predict_proba`. Internally, it uses the model with the +# best parameters found during `fit`. # # Get predictions for the 5 first samples using the estimator with the best # parameters. @@ -186,8 +186,8 @@ # parameters "by hand" through a double for loop. # # In addition, we can inspect all results which are stored in the attribute -# `cv_results_` of the grid-search. We will filter some specific columns from -# these results. +# `cv_results_` of the grid-search. We filter some specific columns from these +# results. # %% cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values( @@ -220,9 +220,9 @@ def shorten_param(param_name): # With only 2 parameters, we might want to visualize the grid-search as a # heatmap. 
We need to transform our `cv_results` into a dataframe where: # -# - the rows will correspond to the learning-rate values; -# - the columns will correspond to the maximum number of leaf; -# - the content of the dataframe will be the mean test scores. +# - the rows correspond to the learning-rate values; +# - the columns correspond to the maximum number of leaf nodes; +# - the content of the dataframe is the mean test scores. # %% pivoted_cv_results = cv_results.pivot_table( @@ -259,7 +259,7 @@ def shorten_param(param_name): # # The precise meaning of those two parameters will be explained later. # -# For now we will note that, in general, **there is no unique optimal parameter +# For now we note that, in general, **there is no unique optimal parameter # setting**: 4 models out of the 12 parameter configurations reach the maximal # accuracy (up to small random fluctuations caused by the sampling of the # training set). diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py index fb7406274..e4ccc7102 100644 --- a/python_scripts/parameter_tuning_nested.py +++ b/python_scripts/parameter_tuning_nested.py @@ -12,12 +12,12 @@ # However, we did not present a proper framework to evaluate the tuned models. # Instead, we focused on the mechanism used to find the best set of parameters. # -# In this notebook, we will reuse some knowledge presented in the module -# "Selecting the best model" to show how to evaluate models where -# hyperparameters need to be tuned. +# In this notebook, we reuse some knowledge presented in the module "Selecting +# the best model" to show how to evaluate models where hyperparameters need to +# be tuned. # -# Thus, we will first load the dataset and create the predictive model that we -# want to optimize and later on, evaluate. +# Thus, we first load the dataset and create the predictive model that we want +# to optimize and later on, evaluate. 
# # ## Loading the dataset # @@ -111,7 +111,7 @@ # ### With hyperparameter tuning # # As shown in the previous notebook, one can use a search strategy that uses -# cross-validation to find the best set of parameters. Here, we will use a +# cross-validation to find the best set of parameters. Here, we use a # grid-search strategy and reproduce the steps done in the previous notebook. # # First, we have to embed our model into a grid-search and specify the diff --git a/python_scripts/parameter_tuning_parallel_plot.py b/python_scripts/parameter_tuning_parallel_plot.py index 304585cb0..340e75dd0 100644 --- a/python_scripts/parameter_tuning_parallel_plot.py +++ b/python_scripts/parameter_tuning_parallel_plot.py @@ -110,8 +110,8 @@ def shorten_param(param_name): # spread the active ranges and improve the readability of the plot. # ``` # -# The parallel coordinates plot will display the values of the hyperparameters -# on different columns while the performance metric is color coded. Thus, we are +# The parallel coordinates plot displays the values of the hyperparameters on +# different columns while the performance metric is color coded. Thus, we are # able to quickly inspect if there is a range of hyperparameters which is # working or not. # diff --git a/python_scripts/parameter_tuning_sol_02.py b/python_scripts/parameter_tuning_sol_02.py index 1ea4cf572..9c5ceaa2c 100644 --- a/python_scripts/parameter_tuning_sol_02.py +++ b/python_scripts/parameter_tuning_sol_02.py @@ -62,10 +62,10 @@ # %% [markdown] # Use the previously defined model (called `model`) and using two nested `for` # loops, make a search of the best combinations of the `learning_rate` and -# `max_leaf_nodes` parameters. In this regard, you will need to train and test -# the model by setting the parameters. The evaluation of the model should be -# performed using `cross_val_score` on the training set. We will use the -# following parameters search: +# `max_leaf_nodes` parameters. 
In this regard, you need to train and test the +# model by setting the parameters. The evaluation of the model should be +# performed using `cross_val_score` on the training set. Use the following +# parameters search: # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls # the ability of a new tree to correct the error of the previous sequence of # trees diff --git a/python_scripts/parameter_tuning_sol_03.py b/python_scripts/parameter_tuning_sol_03.py index 149cc0de1..55773ebd3 100644 --- a/python_scripts/parameter_tuning_sol_03.py +++ b/python_scripts/parameter_tuning_sol_03.py @@ -23,8 +23,8 @@ ) # %% [markdown] -# In this exercise, we will progressively define the regression pipeline and -# later tune its hyperparameters. +# In this exercise, we progressively define the regression pipeline and later +# tune its hyperparameters. # # Start by defining a pipeline that: # * uses a `StandardScaler` to normalize the numerical data; @@ -108,8 +108,8 @@ cv_results = pd.DataFrame(model_random_search.cv_results_) # %% [markdown] tags=["solution"] -# To simplify the axis of the plot, we will rename the column of the dataframe -# and only select the mean test score and the value of the hyperparameters. +# To simplify the axis of the plot, we rename the column of the dataframe and +# only select the mean test score and the value of the hyperparameters. # %% tags=["solution"] column_name_mapping = { @@ -170,7 +170,7 @@ # vary between 0 and 10,000 (e.g. the variable `"Population"`) and B is a # feature that varies between 1 and 10 (e.g. the variable `"AveRooms"`), then # distances between samples (rows of the dataframe) are mostly impacted by -# differences in values of the column A, while values of the column B will be +# differences in values of the column A, while values of the column B are # comparatively ignored. 
If one applies StandardScaler to such a database, both # the values of A and B will be approximately between -3 and 3 and the neighbor # structure will be impacted more or less equivalently by both variables. diff --git a/python_scripts/trees_dataset.py b/python_scripts/trees_dataset.py index 708c61b29..888eee5a7 100644 --- a/python_scripts/trees_dataset.py +++ b/python_scripts/trees_dataset.py @@ -15,7 +15,7 @@ # # ## Classification dataset # -# We will use this dataset in classification setting to predict the penguins' +# We use this dataset in a classification setting to predict the penguins' # species from anatomical information. # # Each penguin is from one of the three following species: Adelie, Gentoo, and @@ -26,15 +26,15 @@ # penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png) # # This problem is a classification problem since the target is categorical. We -# will limit our input data to a subset of the original features to simplify our -# explanations when presenting the decision tree algorithm. Indeed, we will use +# limit our input data to a subset of the original features to simplify our +# explanations when presenting the decision tree algorithm. Indeed, we use # features based on penguins' culmen measurement. You can learn more about the # penguins' culmen with the illustration below: # # ![Image of # culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png) # -# We will start by loading this subset of the dataset. +# We start by loading this subset of the dataset. # %% import pandas as pd @@ -73,11 +73,11 @@ # # In a regression setting, the target is a continuous variable instead of # categories. Here, we use two features of the dataset to make such a problem: -# the flipper length will be used as data and the body mass will be the target. -# In short, we want to predict the body mass using the flipper length. 
+# the flipper length is used as data and the body mass as the target. In short, +# we want to predict the body mass using the flipper length. # -# We will load the dataset and visualize the relationship between the flipper -# length and the body mass of penguins. +# We load the dataset and visualize the relationship between the flipper length +# and the body mass of penguins. # %% penguins = pd.read_csv("../datasets/penguins_regression.csv") diff --git a/python_scripts/trees_ex_02.py b/python_scripts/trees_ex_02.py index 6c7d3b1b1..f53fb7566 100644 --- a/python_scripts/trees_ex_02.py +++ b/python_scripts/trees_ex_02.py @@ -20,7 +20,7 @@ # By extrapolation, we refer to values predicted by a model outside of the range # of feature values seen during the training. # -# We will first load the regression data. +# We first load the regression data. # %% import pandas as pd @@ -61,10 +61,10 @@ # Write your code here. # %% [markdown] -# Now, we will check the extrapolation capabilities of each model. Create a -# dataset containing a broader range of values than your previous dataset, in -# other words, add values below and above the minimum and the maximum of the -# flipper length seen during training. +# Now, we check the extrapolation capabilities of each model. Create a dataset +# containing a broader range of values than your previous dataset, in other +# words, add values below and above the minimum and the maximum of the flipper +# length seen during training. # %% # Write your code here. 
diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py index dc8594968..218e9b38b 100644 --- a/python_scripts/trees_hyperparameters.py +++ b/python_scripts/trees_hyperparameters.py @@ -8,11 +8,11 @@ # %% [markdown] # # Importance of decision tree hyperparameters on generalization # -# In this notebook, we will illustrate the importance of some key -# hyperparameters on the decision tree; we will demonstrate their effects on the -# classification and regression problems we saw previously. +# In this notebook, we illustrate the importance of some key hyperparameters on +# the decision tree; we demonstrate their effects on the classification and +# regression problems we saw previously. # -# First, we will load the classification and regression datasets. +# First, we load the classification and regression datasets. # %% import pandas as pd @@ -35,7 +35,7 @@ # %% [markdown] # ## Create helper functions # -# We will create some helper functions to plot the data samples as well as the +# We create some helper functions to plot the data samples as well as the # decision boundary for classification and the regression line for regression. # %% @@ -135,10 +135,10 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # For both classification and regression setting, we observe that increasing the -# depth will make the tree model more expressive. However, a tree that is too -# deep will overfit the training data, creating partitions which are only -# correct for "outliers" (noisy samples). The `max_depth` is one of the -# hyperparameters that one should optimize via cross-validation and grid-search. +# depth makes the tree model more expressive. However, a tree that is too deep +# may overfit the training data, creating partitions which are only correct for +# "outliers" (noisy samples). The `max_depth` is one of the hyperparameters that +# one should optimize via cross-validation and grid-search. 
# %% from sklearn.model_selection import GridSearchCV @@ -172,15 +172,15 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # # The `max_depth` hyperparameter controls the overall complexity of the tree. # This parameter is adequate under the assumption that a tree is built -# symmetrically. However, there is no guarantee that a tree will be symmetrical. +# symmetrically. However, there is no reason why a tree should be symmetrical. # Indeed, optimal generalization performance could be reached by growing some of # the branches deeper than some others. # -# We will build a dataset where we will illustrate this asymmetry. We will -# generate a dataset composed of 2 subsets: one subset where a clear separation -# should be found by the tree and another subset where samples from both classes -# will be mixed. It implies that a decision tree will need more splits to -# classify properly samples from the second subset than from the first subset. +# We build a dataset where we illustrate this asymmetry. We generate a dataset +# composed of 2 subsets: one subset where a clear separation should be found by +# the tree and another subset where samples from both classes are mixed. It +# implies that a decision tree needs more splits to classify properly samples +# from the second subset than from the first subset. 
# %% from sklearn.datasets import make_blobs @@ -188,11 +188,11 @@ def fit_and_plot_regression(model, data, feature_names, target_names): data_clf_columns = ["Feature #0", "Feature #1"] target_clf_column = "Class" -# Blobs that will be interlaced +# Blobs that are interlaced X_1, y_1 = make_blobs( n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0 ) -# Blobs that will be easily separated +# Blobs that can be easily separated X_2, y_2 = make_blobs(n_samples=300, centers=[[3, 6], [7, 0]], random_state=0) X = np.concatenate([X_1, X_2], axis=0) @@ -214,9 +214,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names): _ = plt.title("Synthetic dataset") # %% [markdown] -# We will first train a shallow decision tree with `max_depth=2`. We would -# expect this depth to be enough to separate the blobs that are easy to -# separate. +# We first train a shallow decision tree with `max_depth=2`. We would expect +# this depth to be enough to separate the blobs that are easy to separate. # %% max_depth = 2 @@ -228,7 +227,7 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # As expected, we see that the blue blob in the lower right and the red blob on -# the top are easily separated. However, more splits will be required to better +# the top are easily separated. However, more splits are required to better # split the blob were both blue and red data points are mixed. # %% @@ -239,7 +238,7 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # We see that the right branch achieves perfect classification. Now, we increase -# the depth to check how the tree will grow. +# the depth to check how the tree grows. # %% max_depth = 6 @@ -260,8 +259,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # beneficial that a branch continue growing. 
# # The hyperparameters `min_samples_leaf`, `min_samples_split`, `max_leaf_nodes`, -# or `min_impurity_decrease` allows growing asymmetric trees and apply a -# constraint at the leaves or nodes level. We will check the effect of +# or `min_impurity_decrease` allow growing asymmetric trees and apply a +# constraint at the leaves or nodes level. We check the effect of # `min_samples_leaf`. # %% @@ -280,5 +279,5 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # This hyperparameter allows to have leaves with a minimum number of samples and -# no further splits will be searched otherwise. Therefore, these hyperparameters +# no further splits are searched otherwise. Therefore, these hyperparameters # could be an alternative to fix the `max_depth` hyperparameter. diff --git a/python_scripts/trees_regression.py b/python_scripts/trees_regression.py index 8431c025c..f354bee6f 100644 --- a/python_scripts/trees_regression.py +++ b/python_scripts/trees_regression.py @@ -31,9 +31,9 @@ data_train, target_train = penguins[[feature_name]], penguins[target_name] # %% [markdown] -# To illustrate how decision trees are predicting in a regression setting, we -# will create a synthetic dataset containing all possible flipper length from -# the minimum to the maximum of the original data. +# To illustrate how decision trees predict in a regression setting, we create a +# synthetic dataset containing some of the possible flipper length values +# between the minimum and the maximum of the original data. # %% import numpy as np @@ -53,9 +53,9 @@ # some intuitive understanding on the shape of the decision function of the # learned decision trees. # -# However computing an evaluation metric on such a synthetic test set would be +# However, computing an evaluation metric on such a synthetic test set would be # meaningless since the synthetic dataset does not follow the same distribution -# as the real world data on which the model will be deployed. 
+# as the real world data on which the model would be deployed. # %% import matplotlib.pyplot as plt @@ -67,7 +67,7 @@ _ = plt.title("Illustration of the regression dataset used") # %% [markdown] -# We will first illustrate the difference between a linear model and a decision +# We first illustrate the difference between a linear model and a decision # tree. # %% @@ -112,9 +112,8 @@ # %% [markdown] # Contrary to linear models, decision trees are non-parametric models: they do -# not make assumptions about the way data is distributed. This will affect the -# prediction scheme. Repeating the above experiment will highlight the -# differences. +# not make assumptions about the way data is distributed. This affects the +# prediction scheme. Repeating the above experiment highlights the differences. # %% from sklearn.tree import DecisionTreeRegressor @@ -170,8 +169,8 @@ _ = plt.title("Prediction function using a DecisionTreeRegressor") # %% [markdown] -# Increasing the depth of the tree will increase the number of partition and -# thus the number of constant values that the tree is capable of predicting. +# Increasing the depth of the tree increases the number of partitions and thus +# the number of constant values that the tree is capable of predicting. # # In this notebook, we highlighted the differences in behavior of a decision # tree used in a classification problem in contrast to a regression problem. diff --git a/python_scripts/trees_sol_02.py b/python_scripts/trees_sol_02.py index cc7d5dbce..2235ddaf1 100644 --- a/python_scripts/trees_sol_02.py +++ b/python_scripts/trees_sol_02.py @@ -14,7 +14,7 @@ # By extrapolation, we refer to values predicted by a model outside of the range # of feature values seen during the training. # -# We will first load the regression data. +# We first load the regression data. # %% import pandas as pd @@ -92,10 +92,10 @@ # interpolate. # %% [markdown] -# Now, we will check the extrapolation capabilities of each model. 
Create a -# dataset containing a broader range of values than your previous dataset, in -# other words, add values below and above the minimum and the maximum of the -# flipper length seen during training. +# Now, we check the extrapolation capabilities of each model. Create a dataset +# containing a broader range of values than your previous dataset, in other +# words, add values below and above the minimum and the maximum of the flipper +# length seen during training. # %% # solution @@ -131,9 +131,9 @@ _ = plt.title("Prediction of linear model and a decision tree") # %% [markdown] tags=["solution"] -# The linear model will extrapolate using the fitted model for flipper lengths < -# 175 mm and > 235 mm. In fact, we are using the model parametrization to make -# this predictions. +# The linear model extrapolates using the fitted model for flipper lengths < 175 +# mm and > 235 mm. In fact, we are using the model parametrization to make these +# predictions. # # As mentioned, decision trees are non-parametric models and we observe that # they cannot extrapolate. For flipper lengths below the minimum, the mass of