# %% [markdown]
# # Sample grouping
#
# In this notebook we present the concept of **sample groups**. We use the
# handwritten digits dataset to highlight some surprising results.

# %%
from sklearn.datasets import load_digits

digits = load_digits()
data, target = digits.data, digits.target

# %% [markdown]
# We create a model consisting of a logistic regression classifier with a
# preprocessor to scale the data.
#
# ```{note}
# Here we use a `MinMaxScaler` as we know that each pixel's gray-scale is
# strictly bounded between 0 (white) and 16 (black). This makes `MinMaxScaler`
# more suited in this case than `StandardScaler`, as some pixels consistently
# have low variance (pixels at the borders might almost always be zero if most
# digits are centered in the image). Using `StandardScaler` would then divide
# by a very small number, resulting in very large scaled values.
# ```

# %%
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1_000))

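# %% [markdown]
# As a quick sanity check of the remark above, we can inspect the spread of
# each pixel across the dataset: some pixels barely vary at all, so
# standardizing them would divide by a near-zero number.

# %%
# per-pixel standard deviation over all images
pixel_std = data.std(axis=0)
print(f"Smallest per-pixel standard deviation: {pixel_std.min():.3f}")
print(f"Largest per-pixel standard deviation: {pixel_std.max():.3f}")
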
# %% [markdown]
# The idea is to compare the estimated generalization performance using
# different cross-validation techniques and see how such estimations are
# impacted by underlying data structures. We first use a `KFold`
# cross-validation without shuffling the data.

# %%
from sklearn.model_selection import cross_val_score, KFold

cv = KFold(shuffle=False)
test_score_no_shuffling = cross_val_score(model, data, target, cv=cv, n_jobs=2)
print(
    "The average accuracy is "
    f"{test_score_no_shuffling.mean():.3f} ± "
    f"{test_score_no_shuffling.std():.3f}"
)

# %% [markdown]
# Now, let's repeat the experiment by shuffling the data within the
# cross-validation.

# %%
cv = KFold(shuffle=True)
test_score_with_shuffling = cross_val_score(model, data, target, cv=cv, n_jobs=2)
print(
    "The average accuracy is "
    f"{test_score_with_shuffling.mean():.3f} ± "
    f"{test_score_with_shuffling.std():.3f}"
)

# %% [markdown]
# We observe that shuffling the data improves the mean accuracy. We can go a
# little further and plot the distribution of the testing scores. To do so, we
# first concatenate the test scores.

# %%
import pandas as pd

all_scores = pd.DataFrame(
    [test_score_no_shuffling, test_score_with_shuffling],
    index=["KFold without shuffling", "KFold with shuffling"],
).T

# %% [markdown]
# Let's now plot the score distributions.

# %%
import matplotlib.pyplot as plt

all_scores.plot.hist(bins=16, edgecolor="black", alpha=0.7)
plt.xlim([0.8, 1.0])
plt.xlabel("Accuracy score")
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = plt.title("Distribution of the test scores")

# %% [markdown]
# Shuffling the data results in a higher cross-validated test accuracy with
# less variance compared to when the data is not shuffled. It means that in
# the non-shuffled case, some specific folds lead to a low score:

# %%
print(test_score_no_shuffling)

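# %% [markdown]
# As a quick check, we can also locate the weakest fold programmatically:

# %%
# index and value of the lowest-scoring fold in the non-shuffled run
worst_fold = test_score_no_shuffling.argmin()
print(f"Fold {worst_fold} yields the lowest accuracy: "
      f"{test_score_no_shuffling[worst_fold]:.3f}")
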
# %% [markdown]
# Thus, shuffling the data breaks the underlying structure, making the
# classification task easier for our model. To get a better understanding, we
# can read the dataset description in more detail:

# %%
print(digits.DESCR)

# %% [markdown]
# The description mentions that 13 writers produced the 1797 samples. Looking
# at the target vector, we can indeed spot a repeating pattern: series of
# digits ordered from 0 to 9, written by the same writer, in blocks of roughly
# 130 samples. Samples from the same writer are therefore stored contiguously.
# We use the boundaries of these blocks (14 of them, so presumably one writer
# contributed two series) to assign a group id to each sample, so that all
# samples from a writer share the same group.

# %%
from itertools import count

import numpy as np

# lower and upper bounds of the sample indices for each writer
writer_boundaries = [
    0, 130, 256, 386, 516, 646, 776, 915, 1029,
    1157, 1287, 1415, 1545, 1667, 1797,
]
groups = np.zeros_like(target)
lower_bounds = writer_boundaries[:-1]
upper_bounds = writer_boundaries[1:]

for group_id, lb, up in zip(count(), lower_bounds, upper_bounds):
    groups[lb:up] = group_id

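# %% [markdown]
# As a quick check, we can count how many samples fall into each group; every
# writer should account for roughly 130 digits:

# %%
print(np.bincount(groups))
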
# %% [markdown]
# We can check the grouping by plotting the writer id assigned to each sample
# index.

# %%
plt.plot(groups)
plt.yticks(np.unique(groups))
plt.xticks(writer_boundaries, rotation=90)
plt.xlabel("Target index")
plt.ylabel("Writer index")
_ = plt.title("Underlying writer groups existing in the target")

# %% [markdown]
# Once we group the digits by writer, we can incorporate this information into
# the cross-validation process by using group-aware variations of the
# strategies we have explored in this course, for example, the `GroupKFold`
# strategy.

# %%
from sklearn.model_selection import GroupKFold

cv = GroupKFold()
test_score = cross_val_score(
    model, data, target, groups=groups, cv=cv, n_jobs=2
)
print(
    "The average accuracy is "
    f"{test_score.mean():.3f} ± "
    f"{test_score.std():.3f}"
)

# %% [markdown]
# We see that this strategy leads to a lower estimate of the generalization
# performance than the other two techniques. However, this is the most
# reliable estimate if our goal is to evaluate the capabilities of the model
# to generalize to new, unseen writers. In this sense, shuffling the dataset
# (or alternatively using the writers' ids as a new feature) would lead the
# model to memorize each writer's particular handwriting.

# %%
all_scores = pd.DataFrame(
    [test_score_no_shuffling, test_score_with_shuffling, test_score],
    index=[
        "KFold without shuffling",
        "KFold with shuffling",
        "KFold with groups",
    ],
).T

# %%
all_scores.plot.hist(bins=16, edgecolor="black", alpha=0.7)
plt.xlim([0.8, 1.0])
plt.xlabel("Accuracy score")
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = plt.title("Distribution of the test scores")

# %% [markdown]
# In conclusion, accounting for any sample grouping patterns is crucial when
# assessing a model's ability to generalize to new groups. Without this
# consideration, the results may appear overly optimistic compared to the
# actual performance.
#
# The interested reader can learn about other group-aware cross-validation
# techniques in the [scikit-learn user
# guide](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data).
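
# %% [markdown]
# As a quick illustration of such a variant, `LeaveOneGroupOut` holds out one
# writer entirely at each iteration, yielding one score per writer:

# %%
from sklearn.model_selection import LeaveOneGroupOut

cv = LeaveOneGroupOut()
scores = cross_val_score(model, data, target, groups=groups, cv=cv, n_jobs=2)
print(
    "The average accuracy is "
    f"{scores.mean():.3f} ± {scores.std():.3f}"
)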