From 3ac8ce470f35feb246d8e2246d209056a39cc35c Mon Sep 17 00:00:00 2001 From: glemaitre Date: Fri, 26 Apr 2024 13:53:47 +0000 Subject: [PATCH] [ci skip] MAINT Fix typos and wording across the mooc (#764) Co-authored-by: ArturoAmorQ 12372259c6a189613cfbd646c60f9862ed7cb4a8 --- .../02_numerical_pipeline_introduction.py | 2 +- .../python_scripts/03_categorical_pipeline.py | 9 +- .../cross_validation_train_test.py | 2 +- _sources/python_scripts/ensemble_sol_02.py | 7 + .../python_scripts/linear_models_ex_04.py | 2 +- .../python_scripts/linear_models_sol_04.py | 2 +- _sources/python_scripts/metrics_regression.py | 5 +- appendix/notebook_timings.html | 52 ++--- .../02_numerical_pipeline_introduction.html | 2 +- python_scripts/03_categorical_pipeline.html | 13 +- ...tegorical_pipeline_column_transformer.html | 10 +- .../cross_validation_train_test.html | 206 +++++++++--------- python_scripts/ensemble_sol_02.html | 5 + python_scripts/linear_models_ex_04.html | 2 +- python_scripts/linear_models_sol_04.html | 2 +- python_scripts/metrics_regression.html | 5 +- .../parameter_tuning_grid_search.html | 100 ++++----- .../parameter_tuning_randomized_search.html | 196 ++++++++--------- searchindex.js | 2 +- 19 files changed, 320 insertions(+), 304 deletions(-) diff --git a/_sources/python_scripts/02_numerical_pipeline_introduction.py b/_sources/python_scripts/02_numerical_pipeline_introduction.py index 8a245611a..940065dc3 100644 --- a/_sources/python_scripts/02_numerical_pipeline_introduction.py +++ b/_sources/python_scripts/02_numerical_pipeline_introduction.py @@ -59,7 +59,7 @@ data # %% [markdown] -# We can now linger on the variables, also denominated features, that we later +# We can now focus on the variables, also denominated features, that we later # use to build our predictive model. In addition, we can also check how many # samples are available in our dataset. diff --git a/_sources/python_scripts/03_categorical_pipeline.py b/_sources/python_scripts/03_categorical_pipeline.py index 64b516070..844a072ca 100644 --- a/_sources/python_scripts/03_categorical_pipeline.py +++ b/_sources/python_scripts/03_categorical_pipeline.py @@ -253,7 +253,7 @@ # and check the generalization performance of this machine learning pipeline using # cross-validation. # -# Before we create the pipeline, we have to linger on the `native-country`. +# Before we create the pipeline, we have to focus on the `native-country`. # Let's recall some statistics regarding this column. # %% @@ -329,9 +329,10 @@ print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}") # %% [markdown] -# As you can see, this representation of the categorical variables is -# slightly more predictive of the revenue than the numerical variables -# that we used previously. +# As you can see, this representation of the categorical variables is slightly +# more predictive of the revenue than the numerical variables that we used +# previously. The reason being that we have more (predictive) categorical +# features than numerical ones. # %% [markdown] # diff --git a/_sources/python_scripts/cross_validation_train_test.py b/_sources/python_scripts/cross_validation_train_test.py index f5bd73b01..68c640da4 100644 --- a/_sources/python_scripts/cross_validation_train_test.py +++ b/_sources/python_scripts/cross_validation_train_test.py @@ -12,7 +12,7 @@ # of predictive models. While this section could be slightly redundant, we # intend to go into details into the cross-validation framework. # -# Before we dive in, let's linger on the reasons for always having training and +# Before we dive in, let's focus on the reasons for always having training and # testing sets. Let's first look at the limitation of using a dataset without # keeping any samples out. # diff --git a/_sources/python_scripts/ensemble_sol_02.py b/_sources/python_scripts/ensemble_sol_02.py index 232ec2c04..061be3e52 100644 --- a/_sources/python_scripts/ensemble_sol_02.py +++ b/_sources/python_scripts/ensemble_sol_02.py @@ -103,3 +103,10 @@ plt.plot(data_range[feature_name], forest_predictions, label="Random forest") _ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") + +# %% [markdown] tags=["solution"] +# The random forest reduces the overfitting of the individual trees but still +# overfits itself. In the section on "hyperparameter tuning with ensemble +# methods" we will see how to further mitigate this effect. Still, interested +# users may increase the number of estimators in the forest and try different +# values of, e.g., `min_samples_split`. diff --git a/_sources/python_scripts/linear_models_ex_04.py b/_sources/python_scripts/linear_models_ex_04.py index dd9ae6bb1..473013074 100644 --- a/_sources/python_scripts/linear_models_ex_04.py +++ b/_sources/python_scripts/linear_models_ex_04.py @@ -17,7 +17,7 @@ # In the previous Module we tuned the hyperparameter `C` of the logistic # regression without mentioning that it controls the regularization strength. # Later, on the slides on 🎥 **Intuitions on regularized linear models** we -# metioned that a small `C` provides a more regularized model, whereas a +# mentioned that a small `C` provides a more regularized model, whereas a # non-regularized model is obtained with an infinitely large value of `C`. # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge` # model. diff --git a/_sources/python_scripts/linear_models_sol_04.py b/_sources/python_scripts/linear_models_sol_04.py index 942aed56d..623afad8e 100644 --- a/_sources/python_scripts/linear_models_sol_04.py +++ b/_sources/python_scripts/linear_models_sol_04.py @@ -11,7 +11,7 @@ # In the previous Module we tuned the hyperparameter `C` of the logistic # regression without mentioning that it controls the regularization strength. # Later, on the slides on 🎥 **Intuitions on regularized linear models** we -# metioned that a small `C` provides a more regularized model, whereas a +# mentioned that a small `C` provides a more regularized model, whereas a # non-regularized model is obtained with an infinitely large value of `C`. # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge` # model. diff --git a/_sources/python_scripts/metrics_regression.py b/_sources/python_scripts/metrics_regression.py index 494447732..4c6d91db7 100644 --- a/_sources/python_scripts/metrics_regression.py +++ b/_sources/python_scripts/metrics_regression.py @@ -97,8 +97,9 @@ # %% [markdown] # The $R^2$ score represents the proportion of variance of the target that is # explained by the independent variables in the model. The best score possible -# is 1 but there is no lower bound. However, a model that predicts the expected -# value of the target would get a score of 0. +# is 1 but there is no lower bound. However, a model that predicts the [expected +# value](https://en.wikipedia.org/wiki/Expected_value) of the target would get a +# score of 0. # %% from sklearn.dummy import DummyRegressor diff --git a/appendix/notebook_timings.html b/appendix/notebook_timings.html index 093691c5d..718f2058b 100644 --- a/appendix/notebook_timings.html +++ b/appendix/notebook_timings.html @@ -668,9 +668,9 @@

Notebook timings

python_scripts/01_tabular_data_exploration

-

2024-04-26 13:50

+

2024-04-26 13:51

cache

-

8.22

+

7.83

python_scripts/01_tabular_data_exploration_ex_01

@@ -704,15 +704,15 @@

Notebook timings

python_scripts/02_numerical_pipeline_hands_on

-

2024-04-26 13:50

+

2024-04-26 13:51

cache

-

2.01

+

1.98

python_scripts/02_numerical_pipeline_introduction

-

2024-04-26 13:50

+

2024-04-26 13:51

cache

-

4.8

+

5.06

python_scripts/02_numerical_pipeline_scaling

@@ -734,15 +734,15 @@

Notebook timings

python_scripts/03_categorical_pipeline

-

2024-04-26 13:50

+

2024-04-26 13:51

cache

-

2.8

+

3.06

python_scripts/03_categorical_pipeline_column_transformer

-

2024-04-26 13:50

+

2024-04-26 13:52

cache

-

4.23

+

4.42

python_scripts/03_categorical_pipeline_ex_01

@@ -836,9 +836,9 @@

Notebook timings

python_scripts/cross_validation_train_test

-

2024-04-26 13:51

+

2024-04-26 13:52

cache

-

10.87

+

11.39

python_scripts/cross_validation_validation_curve

@@ -1004,9 +1004,9 @@

Notebook timings

python_scripts/linear_models_ex_02

-

2024-04-26 13:51

+

2024-04-26 13:52

cache

-

1.09

+

1.17

python_scripts/linear_models_ex_03

@@ -1040,9 +1040,9 @@

Notebook timings

python_scripts/linear_models_sol_02

-

2024-04-26 13:51

+

2024-04-26 13:52

cache

-

6.1

+

6.45

python_scripts/linear_models_sol_03

@@ -1070,9 +1070,9 @@

Notebook timings

python_scripts/linear_regression_without_sklearn

-

2024-04-26 13:51

+

2024-04-26 13:52

cache

-

2.65

+

2.99

python_scripts/logistic_regression

@@ -1130,15 +1130,15 @@

Notebook timings

python_scripts/parameter_tuning_grid_search

-

2024-04-26 13:51

+

2024-04-26 13:52

cache

-

10.21

+

10.5

python_scripts/parameter_tuning_manual

-

2024-04-26 13:51

+

2024-04-26 13:52

cache

-

4.17

+

4.45

python_scripts/parameter_tuning_nested

@@ -1154,9 +1154,9 @@

Notebook timings

python_scripts/parameter_tuning_randomized_search

-

2024-04-26 13:51

+

2024-04-26 13:53

cache

-

24.21

+

22.88

python_scripts/parameter_tuning_sol_02

@@ -1178,9 +1178,9 @@

Notebook timings

python_scripts/trees_dataset

-

2024-04-26 13:51

+

2024-04-26 13:53

cache

-

2.75

+

3.06

python_scripts/trees_ex_01

diff --git a/python_scripts/02_numerical_pipeline_introduction.html b/python_scripts/02_numerical_pipeline_introduction.html index 233cfdf9b..38dcd179e 100644 --- a/python_scripts/02_numerical_pipeline_introduction.html +++ b/python_scripts/02_numerical_pipeline_introduction.html @@ -1003,7 +1003,7 @@

Separate the data and the target diff --git a/python_scripts/03_categorical_pipeline.html b/python_scripts/03_categorical_pipeline.html index 075cf7e43..78074b72e 100644 --- a/python_scripts/03_categorical_pipeline.html +++ b/python_scripts/03_categorical_pipeline.html @@ -1958,7 +1958,7 @@

Evaluate our predictive pipelinenative-country. +

Before we create the pipeline, we have to focus on the native-country. Let’s recall some statistics regarding this column.

@@ -2078,8 +2078,8 @@

Evaluate our predictive pipeline -
{'fit_time': array([0.18181372, 0.16217852, 0.17221045, 0.17812014, 0.16503692]),
- 'score_time': array([0.02207351, 0.02198744, 0.02215958, 0.02402902, 0.02280927]),
+
{'fit_time': array([0.18064904, 0.16906261, 0.17876267, 0.20569158, 0.17099452]),
+ 'score_time': array([0.02239752, 0.0232501 , 0.02577472, 0.02373815, 0.02260184]),
  'test_score': array([0.83232675, 0.83570478, 0.82831695, 0.83292383, 0.83497133])}
 
@@ -2098,9 +2098,10 @@

Evaluate our predictive pipeline
  • seen two common strategies for encoding categorical features: ordinal diff --git a/python_scripts/03_categorical_pipeline_column_transformer.html b/python_scripts/03_categorical_pipeline_column_transformer.html index 66739c5ec..e90c3797a 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.html +++ b/python_scripts/03_categorical_pipeline_column_transformer.html @@ -1571,8 +1571,8 @@

    Evaluation of the model with cross-validation -
    {'fit_time': array([0.24788404, 0.24504352, 0.22222066, 0.23252439, 0.26179743]),
    - 'score_time': array([0.02705979, 0.0278194 , 0.02626395, 0.02863431, 0.02582169]),
    +
    {'fit_time': array([0.25630689, 0.26000094, 0.22319031, 0.24449325, 0.26766682]),
    + 'score_time': array([0.02926874, 0.02974772, 0.02790833, 0.02923989, 0.02736449]),
      'test_score': array([0.85116184, 0.84993346, 0.8482801 , 0.85257985, 0.85544636])}
     
    @@ -1644,8 +1644,8 @@

    Fitting a more powerful model -
    CPU times: user 657 ms, sys: 15.8 ms, total: 672 ms
    -Wall time: 672 ms
    +
    CPU times: user 680 ms, sys: 12 ms, total: 692 ms
    +Wall time: 692 ms
     
    @@ -1657,7 +1657,7 @@

    Fitting a more powerful model -
    -
    {'fit_time': array([0.1639955 , 0.15742874, 0.15386415, 0.15139842, 0.15369725]),
    - 'score_time': array([0.00204206, 0.00168586, 0.00169134, 0.0016768 , 0.00165772]),
    +
    +

    The random forest reduces the overfitting of the individual trees but still +overfits itself. In the section on “hyperparameter tuning with ensemble +methods” we will see how to further mitigate this effect. Still, interested +users may increase the number of estimators in the forest and try different +values of, e.g., min_samples_split.