From 767499b99c793ea8172d2357a7130bba1f68474c Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 26 Oct 2023 13:48:38 +0200 Subject: [PATCH] ENH Mention scaling behavior of binning and splines (#739) Co-authored-by: ArturoAmorQ Co-authored-by: Olivier Grisel --- ...dels_feature_engineering_classification.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/python_scripts/linear_models_feature_engineering_classification.py b/python_scripts/linear_models_feature_engineering_classification.py index 9fd203f34..12a2997da 100644 --- a/python_scripts/linear_models_feature_engineering_classification.py +++ b/python_scripts/linear_models_feature_engineering_classification.py @@ -235,7 +235,10 @@ def plot_decision_boundary(model, title=None): # %% from sklearn.preprocessing import KBinsDiscretizer -classifier = make_pipeline(KBinsDiscretizer(n_bins=5), LogisticRegression()) +classifier = make_pipeline( + KBinsDiscretizer(n_bins=5, encode="onehot"), # already the default params + LogisticRegression(), +) classifier # %% @@ -279,15 +282,20 @@ def plot_decision_boundary(model, title=None): # We can see that the decision boundary is now smooth, and while it favors # axis-aligned decision rules when extrapolating in low density regions, it can # adopt a more curvy decision boundary in the high density regions. -# -# Note however, that the number of knots is a hyperparameter that needs to be -# tuned. If we use too few knots, the model would underfit the data, as shown on -# the moons dataset. If we use too many knots, the model would overfit the data. -# # However, as for the binning transformation, the model still fails to separate # the data for the XOR dataset, irrespective of the number of knots, for the # same reasons: **the spline transformation is a feature-wise transformation** # and thus **cannot capture interactions** between features. 
+# +# Keep in mind that the number of knots is a hyperparameter that needs to be +# tuned. If we use too few knots, the model would underfit the data, as shown on +# the moons dataset. If we use too many knots, the model would overfit the data. +# +# ```{note} +# Notice that `KBinsDiscretizer(encode="onehot")` and `SplineTransformer` do not +# require additional scaling. Indeed, they can replace the scaling step for +# numerical features: they both create features with values in the [0, 1] range. +# ``` # %% [markdown] #