scikit-learn-contrib · JulienRoussel77 · Aug 30, 2025 · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,7 +2,12 @@
 History
 =======
 
-0.1.8 (2024-08-29)
+0.1.10 (2024-??-??)
+------------------
+* Long EM and RPCA operations wrapped with tqdm progress bars
+* Readme code sample updated, and results table made consistent
+
+0.1.9 (2024-08-29)
 ------------------
 * Tutorials reproducibility improved with random_state parameters
 * RPCA now accepts random_state parameters

diff --git a/README.rst b/README.rst
@@ -70,26 +70,26 @@ With just these few lines of code, you can see how easy it is to
   from qolmat.utils import data
 
   # load and prepare csv data
+
   df_data = data.get_data("Beijing")
   columns = ["TEMP", "PRES", "WSPM"]
   df_data = df_data[columns]
   df_with_nan = data.add_holes(df_data, ratio_masked=0.2, mean_size=120)
 
   # impute and compare
-  imputer_mean = imputers.ImputerSimple(strategy="mean", groups=("station",))
+  imputer_median = imputers.ImputerSimple(groups=("station",))
   imputer_interpol = imputers.ImputerInterpolation(method="linear", groups=("station",))
   imputer_var1 = imputers.ImputerEM(model="VAR", groups=("station",), method="mle", max_iter_em=50, n_iter_ou=15, dt=1e-3, p=1)
   dict_imputers = {
-        "mean": imputer_mean,
+        "median": imputer_median,
         "interpolation": imputer_interpol,
         "VAR(1) process": imputer_var1
     }
   generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)
   comparison = comparator.Comparator(
         dict_imputers,
-        columns,
         generator_holes = generator_holes,
-        metrics = ["mae", "wmape", "kl_columnwise", "ks_test", "energy"],
+        metrics = ["mae", "wmape", "kl_columnwise", "frechet"],
     )
   results = comparison.compare(df_with_nan)
   results.style.highlight_min(color="lightsteelblue", axis=1)

diff --git a/docs/images/readme_tabular_comparison.png b/docs/images/readme_tabular_comparison.png
diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py
@@ -128,7 +128,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
     generator_holes=generator_holes,
     metrics=["mae", "wmape", "kl_columnwise", "wasserstein_columnwise"],
     max_evals=10,

diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py
@@ -89,7 +89,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
     generator_holes=generator_holes,
     metrics=metrics,
     max_evals=2,

diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py
@@ -169,7 +169,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
     generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )
@@ -224,7 +223,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
     generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )

diff --git a/examples/tutorials/plot_tuto_mean_median.py b/examples/tutorials/plot_tuto_mean_median.py
@@ -123,7 +123,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
     generator_holes=generator_holes,
     metrics=metrics,
     max_evals=5,

diff --git a/pyproject.toml b/pyproject.toml
@@ -45,6 +45,7 @@ statsmodels = ">= 0.14.0"
 typed-ast = { version = "*", optional = true }
 category-encoders = "^2.6.3"
 dcor = ">= 0.6"
+tqdm = "*"
 
 [tool.poetry.group.torch.dependencies]
 torch = "< 2.5"

diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py
@@ -28,9 +28,6 @@ class Comparator:
     ----------
     dict_models: Dict[str, any]
         dictionary of imputation methods
-    selected_columns: List[str]Œ
-        list of column's names selected (all with at least one null value will
-        be imputed)
     columnwise_evaluation : Optional[bool], optional
         whether the metric should be calculated column-wise or not,
         by default False
@@ -46,7 +43,6 @@ class Comparator:
     def __init__(
         self,
         dict_models: Dict[str, Any],
-        selected_columns: List[str],
         generator_holes: _HoleGenerator,
         metrics: List = ["mae", "wmape", "kl_columnwise"],
         dict_config_opti: Optional[Dict[str, Any]] = {},
@@ -55,7 +51,6 @@ def __init__(
         verbose: bool = False,
     ):
         self.dict_imputers = dict_models
-        self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.metrics = metrics
         self.dict_config_opti = dict_config_opti

diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -835,6 +835,7 @@ def sum_pairwise_distances(
 def frechet_distance_base(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
 ) -> pd.Series:
     """Compute the Fréchet distance between two dataframes df1 and df2.
 
@@ -853,16 +854,24 @@ def frechet_distance_base(
         true dataframe
     df2 : pd.DataFrame
         predicted dataframe
+    df_mask : pd.DataFrame
+        Boolean mask of the elements to compute on; others are set to NaN
 
     Returns
     -------
     pd.Series
         Frechet distance in a Series object
 
     """
-    if df1.shape != df2.shape:
+    if df1.shape != df2.shape or df1.shape != df_mask.shape:
         raise Exception("inputs have to be of same dimensions.")
 
+    df1 = df1.copy()
+    df2 = df2.copy()
+    # Set to nan the values not in the mask
+    df1[~df_mask] = np.nan
+    df2[~df_mask] = np.nan
+
     std = (np.std(df1) + np.std(df2) + EPS) / 2
     mu = (np.nanmean(df1, axis=0) + np.nanmean(df2, axis=0)) / 2
     df1 = (df1 - mu) / std
@@ -911,7 +920,7 @@ def frechet_distance(
 
     """
     if method == "single":
-        return frechet_distance_base(df1, df2)
+        return frechet_distance_base(df1, df2, df_mask)
     return pattern_based_weighted_mean_metric(
         df1,
         df2,

diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
@@ -11,6 +11,7 @@
 from scipy import optimize as spo
 from sklearn import utils as sku
 from sklearn.base import BaseEstimator, TransformerMixin
+from tqdm import tqdm
 
 from qolmat.utils import utils
 from qolmat.utils.utils import RandomSetting
@@ -433,7 +434,11 @@ def fit_X(self, X: NDArray) -> None:
 
         X = self._maximize_likelihood(X_imp, mask_na)
 
-        for iter_em in range(self.max_iter_em):
+        for iter_em in tqdm(
+            range(self.max_iter_em),
+            desc="EM parameters estimation",
+            disable=not self.verbose,
+        ):
             X = self._sample_ou(X, mask_na)
 
             self.combine_parameters()
@@ -474,6 +479,7 @@ def fit(self, X: NDArray) -> "EM":
         if hasattr(self, "p_to_fit") and self.p_to_fit:
             aics: List[float] = []
             for p in range(self.max_lagp + 1):
+                print("p=", p)
                 self.p = p
                 self.fit_X(X)
                 n1, n2 = self.X.shape

diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from numpy.typing import NDArray
 from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
 
 # from typing_extensions import Self
 from qolmat.benchmark import metrics
@@ -106,23 +107,29 @@ def _fit_estimator(
         optimizer = optim.Adam(estimator.parameters(), lr=self.learning_rate)
         loss_fn = self.loss_fn
 
-        for epoch in range(self.epochs):
-            estimator.train()
-            optimizer.zero_grad()
-
-            input_data = torch.Tensor(X.values)
-            target_data = torch.Tensor(y.values)
-            target_data = target_data.unsqueeze(1)
-            outputs = estimator(input_data)
-            loss = loss_fn(outputs, target_data)
-
-            loss.backward()
-            optimizer.step()
-            if (epoch + 1) % 10 == 0:
-                logging.info(
-                    f"Epoch [{epoch + 1}/{self.epochs}], "
-                    f"Loss: {loss.item():.4f}"
-                )
+        # if X.shape[0] != estimator[0].in_features:
+        #     raise ValueError(
+        #         "The number of features in X does not match the input "
+        #         "features of the estimator. The estimator expects"
+        #         f" {estimator[0].in_features} features, but X has "
+        #         f"{X.shape[0]} features."
+        #     )
+
+        with tqdm(total=self.epochs, desc="Training", unit="epoch") as pbar:
+            for _ in range(self.epochs):
+                estimator.train()
+                optimizer.zero_grad()
+
+                input_data = torch.Tensor(X.values)
+                target_data = torch.Tensor(y.values)
+                target_data = target_data.unsqueeze(1)
+                outputs = estimator(input_data)
+                loss = loss_fn(outputs, target_data)
+
+                loss.backward()
+                optimizer.step()
+                pbar.set_postfix(loss=f"{loss.item():.4f}")
+                pbar.update(1)
         return estimator
 
     def _predict_estimator(