Use balanced accuracy with adult census dataset #785

Open
wants to merge 2 commits into base: main
58 changes: 39 additions & 19 deletions python_scripts/linear_models_sol_03.py
@@ -8,8 +8,8 @@
# %% [markdown]
# # 📃 Solution for Exercise M4.03
#
-# Now, we tackle a more realistic classification problem instead of making a
-# synthetic dataset. We start by loading the Adult Census dataset with the
+# Now, we tackle a (relatively) realistic classification problem instead of making
+# a synthetic dataset. We start by loading the Adult Census dataset with the
# following snippet. For the moment we retain only the **numerical features**.

# %%
@@ -24,10 +24,13 @@
# %% [markdown]
# We confirm that all the selected features are numerical.
#
-# Compute the generalization performance in terms of accuracy of a linear model
-# composed of a `StandardScaler` and a `LogisticRegression`. Use a 10-fold
-# cross-validation with `return_estimator=True` to be able to inspect the
-# trained estimators.
+# Define a linear model composed of a `StandardScaler` followed by a
+# `LogisticRegression` with default parameters.
+#
+# Then use a 10-fold cross-validation to estimate its generalization performance
+# in terms of balanced accuracy, i.e. set `scoring="balanced_accuracy"` in the
+# `cross_validate` function. Also set `return_estimator=True` to be able to
+# inspect the trained estimators.

# %%
# solution
@@ -38,7 +41,12 @@

model = make_pipeline(StandardScaler(), LogisticRegression())
cv_results_lr = cross_validate(
-    model, data, target, cv=10, return_estimator=True
+    model,
+    data,
+    target,
+    cv=10,
+    return_estimator=True,
+    scoring="balanced_accuracy",
)
test_score_lr = cv_results_lr["test_score"]
test_score_lr
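Editor's note: for readers who want to run this first step outside the course repository, here is a minimal, self-contained sketch. It assumes the Adult Census data is fetched from OpenML rather than read from the course's local CSV, so the exact column set may differ slightly from the notebook; the pipeline and cross-validation call mirror the updated solution above.

# Minimal sketch (assumption: data fetched from OpenML, not the course CSV).
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# "adult" on OpenML; the version number is an assumption.
adult = fetch_openml("adult", version=2, as_frame=True)
target = adult.target
# Retain only the numerical features for this first model.
data = adult.data.select_dtypes(include="number")

model = make_pipeline(StandardScaler(), LogisticRegression())
cv_results_lr = cross_validate(
    model,
    data,
    target,
    cv=10,
    return_estimator=True,
    scoring="balanced_accuracy",
)
print(cv_results_lr["test_score"].mean())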
@@ -84,11 +92,11 @@
# - The numerical data must be scaled.
# - The categorical data must be one-hot encoded, set `min_frequency=0.01` to
# group categories concerning less than 1% of the total samples.
-# - The predictor is a `LogisticRegression`. You may need to increase the number
-#   of `max_iter`, which is 100 by default.
+# - The predictor is a `LogisticRegression` with default parameters, except that
+#   you may need to increase the number of `max_iter`, which is 100 by default.
#
-# Use the same 10-fold cross-validation strategy with `return_estimator=True` as
-# above to evaluate this complex pipeline.
+# Use the same 10-fold cross-validation strategy with `return_estimator=True`
+# and `scoring="balanced_accuracy"` as above to evaluate this complex pipeline.

# %%
# solution
@@ -108,7 +116,13 @@
)
model = make_pipeline(preprocessor, LogisticRegression(max_iter=5_000))
cv_results_complex_lr = cross_validate(
-    model, data, target, cv=10, return_estimator=True, n_jobs=2
+    model,
+    data,
+    target,
+    cv=10,
+    return_estimator=True,
+    scoring="balanced_accuracy",
+    n_jobs=2,
)
test_score_complex_lr = cv_results_complex_lr["test_score"]
test_score_complex_lr
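Editor's note: the definition of `preprocessor` is collapsed in the diff above. As a hedged sketch matching the bullet points (the exact selectors and options in the course solution may differ; `handle_unknown="ignore"` is a common safeguard added here, not something the exercise requires):

# Sketch of a preprocessor: scale numerical columns, one-hot encode categorical
# ones, and group rare categories (< 1% of the samples) via min_frequency=0.01.
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_columns = make_column_selector(dtype_include=["object", "category"])
numerical_columns = make_column_selector(dtype_include="number")

preprocessor = ColumnTransformer(
    transformers=[
        (
            "categorical",
            OneHotEncoder(handle_unknown="ignore", min_frequency=0.01),
            categorical_columns,
        ),
        ("numerical", StandardScaler(), numerical_columns),
    ]
)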
@@ -135,7 +149,7 @@
)
plt.ylim((0, 1))
plt.xlabel("Cross-validation iteration")
plt.ylabel("Accuracy")
plt.ylabel("Balanced accuracy")
_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

print(
@@ -195,9 +209,9 @@

# %% [markdown]
# Now create a similar pipeline consisting of the same preprocessor as above,
-# followed by a `PolynomialFeatures` and a logistic regression with `C=0.01`.
-# Set `degree=2` and `interaction_only=True` to the feature engineering step.
-# Remember not to include a "bias" feature to avoid introducing a redundancy
+# followed by a `PolynomialFeatures` and a logistic regression with enough
+# `max_iter`. Set `interaction_only=True` in the feature engineering step, and
+# remember not to include a "bias" feature to avoid introducing a redundancy
# with the intercept of the subsequent logistic regression.

# %%
@@ -207,11 +221,16 @@
model_with_interaction = make_pipeline(
preprocessor,
PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
-    LogisticRegression(C=0.01, max_iter=5_000),
+    LogisticRegression(max_iter=5_000),
)
model_with_interaction
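Editor's note: as a side illustration (not part of the PR), the toy example below shows what this feature engineering step produces: with `interaction_only=True` and `include_bias=False`, `PolynomialFeatures` keeps the original columns and appends only their pairwise products, with no constant column.

# Toy illustration of the feature engineering step used above.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X_toy = np.array([[1.0, 2.0, 3.0]])
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
print(poly.fit_transform(X_toy))
# [[1. 2. 3. 2. 3. 6.]]  -> x1, x2, x3, x1*x2, x1*x3, x2*x3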

# %% [markdown]
+# Use the same 10-fold cross-validation strategy with
+# `scoring="balanced_accuracy"` as above to evaluate this complex pipeline. In
+# this case there is no need to return the estimator, as the number of features
+# is much too large to visually explore the learned coefficients.
+#
# By comparing the cross-validation test scores of both models fold-to-fold,
# count the number of times the model using multiplicative interactions and both
# numerical and categorical features has a better test score than the model
@@ -224,7 +243,7 @@
data,
target,
cv=10,
-    return_estimator=True,
+    scoring="balanced_accuracy",
n_jobs=2,
)
test_score_interactions = cv_results_interactions["test_score"]
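Editor's note: the fold-to-fold comparison requested above can be done with a simple element-wise comparison, using the score arrays already computed in this solution:

# Count folds where the interaction model beats the model without interactions.
import numpy as np

n_wins = np.sum(test_score_interactions > test_score_complex_lr)
print(f"Interactions win on {n_wins} out of {len(test_score_interactions)} folds.")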
@@ -247,8 +266,9 @@
color="black",
label="all features and interactions",
)
+plt.ylim((0, 1))
plt.xlabel("Cross-validation iteration")
plt.ylabel("Accuracy")
plt.ylabel("Balanced accuracy")
_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

print(