new strat_num for imputer

tvdboom · Jan 31, 2024 · 9bf0c8f · 9bf0c8f
1 parent f96b802
commit 9bf0c8f
Show file tree

Hide file tree

Showing 24 changed files with 531 additions and 422 deletions.
diff --git a/atom/atom.py b/atom/atom.py
@@ -1699,14 +1699,15 @@ def impute(
 
         Impute or remove missing values according to the selected
         strategy. Also removes rows and columns with too many missing
-        values. Use the `missing` attribute to customize what are
-        considered "missing values".
+        values.
 
         See the [Imputer][] class for a description of the parameters.
 
         !!! tip
-            Use the [nans][self-nans] attribute to check the amount of
-            missing values per column.
+            - Use the [nans][self-nans] attribute to check the amount of
+              missing values per column.
+            - Use the [`missing`][self-missing] attribute to customize
+              what are considered "missing values".
 
         """
         columns = kwargs.pop("columns", None)

diff --git a/atom/basemodel.py b/atom/basemodel.py
@@ -309,7 +309,7 @@ def fullname(self) -> str:
         """Return the model's class name."""
         return self.__class__.__name__
 
-    @property
+    @cached_property
     def _est_class(self) -> type[Predictor]:
         """Return the estimator's class (not instance).
 
@@ -698,8 +698,8 @@ def _get_pred(
                     method=method_caller,
                 )
 
-            except ValueError as ex:
-                # Fails for models that don't allow in-sample predictions
+            except (ValueError, NotImplementedError) as ex:
+                # Can fail for models that don't allow in-sample predictions
                 self._log(
                     f"Failed to get predictions for model {self.name} "
                     f"on rows {rows}. Returning NaN. Exception: {ex}.", 3

diff --git a/atom/basetransformer.py b/atom/basetransformer.py
@@ -32,6 +32,7 @@
 from joblib.memory import Memory
 from pandas._typing import Axes
 from ray.util.joblib import register_ray
+from sklearn.base import OneToOneFeatureMixin
 from sklearn.utils.validation import check_memory
 
 from atom.utils.types import (
@@ -40,7 +41,9 @@
     Pandas, Sequence, Severity, Verbose, Warnings, XSelector, YSelector,
     bool_t, dataframe_t, int_t, sequence_t,
 )
-from atom.utils.utils import crash, flt, lst, n_cols, to_df, to_pandas
+from atom.utils.utils import (
+    crash, flt, lst, n_cols, to_df, to_pandas, wrap_fit,
+)
 
 
 T_Estimator = TypeVar("T_Estimator", bound=Estimator)
@@ -373,6 +376,11 @@ def _inherit(self, obj: T_Estimator, fixed: tuple[str, ...] = ()) -> T_Estimator
         to that of this instance. If `obj` is a meta-estimator, it
         also adjusts the parameters of the base estimator.
 
+        Additionally, the `fit` method of non-sklearn objects is wrapped
+        to always add the `n_features_in_` and `feature_names_in_`
+        attributes, and the `get_feature_names_out` method is added to
+        transformers that don't have it already.
+
         Parameters
         ----------
         obj: Estimator
@@ -398,6 +406,12 @@ def _inherit(self, obj: T_Estimator, fixed: tuple[str, ...] = ()) -> T_Estimator
                 else:
                     obj.set_params(**{p: lst(self._config.sp.sp)[0]})
 
+        if hasattr(obj, "fit") and "sklearn" not in obj.__module__:
+            obj.__class__.fit = wrap_fit(obj.__class__.fit)  # type: ignore[method-assign]
+        if hasattr(obj, "transform") and not hasattr(obj, "get_feature_names_out"):
+            # We assume here that the transformer neither creates nor removes columns
+            obj.__class__.get_feature_names_out = OneToOneFeatureMixin.get_feature_names_out
+
         return obj
 
     def _get_est_class(self, name: str, module: str) -> type[Estimator]:

diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
@@ -46,18 +46,19 @@
 from sktime.transformations.series.detrend import (
     ConditionalDeseasonalizer, Deseasonalizer, Detrender,
 )
+from sktime.transformations.series.impute import Imputer as sktimeImputer
 from typing_extensions import Self
 
 from atom.basetransformer import BaseTransformer
 from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING
 from atom.utils.patches import wrap_method_output
 from atom.utils.types import (
     Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine,
-    Estimator, FloatLargerZero, IntLargerEqualZero, IntLargerTwo,
-    IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor,
-    PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, Series,
-    Transformer, Verbose, XSelector, YSelector, dataframe_t, sequence_t,
-    series_t,
+    EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero,
+    IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats,
+    Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels,
+    Sequence, Series, Transformer, Verbose, XSelector, YSelector, dataframe_t,
+    sequence_t, series_t,
 )
 from atom.utils.utils import (
     Goal, bk, check_is_fitted, composed, crash, get_col_order, get_cols, it,
@@ -92,6 +93,17 @@ def __init_subclass__(cls, **kwargs):
         with patch("sklearn.utils._set_output._wrap_method_output", wrap_method_output):
             super().__init_subclass__(**kwargs)
 
+    def __repr__(self, N_CHAR_MAX: Int = 700) -> str:
+        """Drop named tuples with default parameters from the string representation."""
+        out = super().__repr__(N_CHAR_MAX)
+
+        # Remove default engine for cleaner representation
+        if hasattr(self, "engine") and self.engine == EngineTuple():
+            out = re.sub(r"engine=EngineTuple\(data='numpy', estimator='sklearn'\)", "", out)
+            out = re.sub(r"((?<=\(),\s|,\s(?=\))|,\s(?=,\s))", "", out)  # Drop comma-spaces
+
+        return out
+
     def __sklearn_clone__(self: T) -> T:
         """Wrap cloning method to attach internal attributes."""
         cloned = _clone_parametrized(self)
@@ -1521,10 +1533,6 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
 
             return labels
 
-        Xt, yt = self._check_input(X, y)
-        self._check_feature_names(Xt, reset=True)
-        self._check_n_features(Xt, reset=True)
-
         self._estimators: dict[str, Estimator] = {}
         self._labels: dict[str, Sequence[str]] = {}
 
@@ -1548,7 +1556,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
                         raise ValueError(
                             "Invalid value for the bins parameter. The length of the "
                             "bins does not match the length of the columns, got len"
-                            f"(bins)={len(bins_c)} and len(columns)={Xt.shape[1]}."
+                            f"(bins)={len(bins_c)} and len(columns)={X.shape[1]}."
                         ) from None
                 else:
                     bins_x = bins_c
@@ -1566,7 +1574,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
                     encode="ordinal",
                     strategy=self.strategy,
                     **kwargs,
-                ).fit(Xt[[col]])
+                ).fit(X[[col]])
 
                 # Save labels for transform method
                 self._labels[col] = get_labels(
@@ -1592,7 +1600,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
                 self._estimators[col] = FunctionTransformer(
                     func=bk.cut,
                     kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)},
-                ).fit(Xt[[col]])
+                ).fit(X[[col]])
 
         return self
 
@@ -2021,7 +2029,7 @@ class Imputer(TransformerMixin, _SetOutputMixin):
 
     Impute or remove missing values according to the selected strategy.
     Also removes rows and columns with too many missing values. Use
-    the `missing` attribute to customize what are considered "missing
+    the `missing_` attribute to customize what are considered "missing
     values".
 
     This class can be accessed from atom through the [impute]
@@ -2036,9 +2044,18 @@ class Imputer(TransformerMixin, _SetOutputMixin):
         - "drop": Drop rows containing missing values.
         - "mean": Impute with mean of column.
         - "median": Impute with median of column.
+        - "most_frequent": Impute with the most frequent value.
         - "knn": Impute using a K-Nearest Neighbors approach.
         - "iterative": Impute using a multivariate imputer.
-        - "most_frequent": Impute with the most frequent value.
+        - "drift": Impute values using a [PolynomialTrend][] model.
+        - "linear": Impute using linear interpolation.
+        - "nearest": Impute with nearest value.
+        - "bfill": Impute by using the next valid observation to fill
+          the gap.
+        - "ffill": Impute by propagating the last valid observation
+          to the next valid one.
+        - "random": Impute with random values between the min and max
+          of column.
         - int or float: Impute with provided numerical value.
 
     strat_cat: str, default="drop"
@@ -2263,6 +2280,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
                 num_imputer = IterativeImputer(random_state=self.random_state)
             elif self.strat_num == "drop":
                 num_imputer = "passthrough"
+            else:
+                # Inherit sklearn's attributes and methods
+                num_imputer = self._inherit(
+                    sktimeImputer(
+                        method=self.strat_num,
+                        missing_values=[pd.NA],
+                        random_state=self.random_state,
+                    )
+                )
         else:
             num_imputer = SimpleImputer(
                 missing_values=pd.NA,
@@ -2401,8 +2427,7 @@ def transform(
                 if name not in self._estimator.feature_names_in_:
                     self._log(
                         f" --> Dropping feature {name}. Contains {nans} "
-                        f"({nans * 100 // len(X)}%) missing values.",
-                        2,
+                        f"({nans * 100 // len(X)}%) missing values.", 2,
                     )
                     X = X.drop(columns=name)
                     continue
@@ -2411,34 +2436,34 @@ def transform(
                     if not isinstance(self.strat_num, str):
                         self._log(
                             f" --> Imputing {nans} missing values with "
-                            f"number '{self.strat_num}' in feature {name}.",
-                            2,
+                            f"number '{self.strat_num}' in column {name}.", 2,
                         )
                     elif self.strat_num in ("knn", "iterative"):
                         self._log(
                             f" --> Imputing {nans} missing values using "
-                            f"the {self.strat_num} imputer in feature {name}.",
-                            2,
+                            f"the {self.strat_num} imputer in column {name}.", 2,
+                        )
+                    elif self.strat_num in ("mean", "median", "most_frequent"):
+                        self._log(
+                            f" --> Imputing {nans} missing values with {self.strat_num} "
+                            f"({np.round(get_stat(num_imputer, name), 2)}) in column "
+                            f"{name}.", 2,
                         )
-                    elif self.strat_num != "drop":  # mean, median or most_frequent
+                    else:
                         self._log(
                             f" --> Imputing {nans} missing values with {self.strat_num} "
-                            f"({np.round(get_stat(num_imputer, name), 2)}) in feature "
-                            f"{name}.",
-                            2,
+                            f"in column {name}.", 2,
                         )
                 elif self.strat_cat != "drop" and name in cat_imputer.feature_names_in_:
                     if self.strat_cat == "most_frequent":
                         self._log(
                             f" --> Imputing {nans} missing values with most_frequent "
-                            f"({get_stat(cat_imputer, name)}) in feature {name}.",
-                            2,
+                            f"({get_stat(cat_imputer, name)}) in column {name}.", 2,
                         )
                     elif self.strat_cat != "drop":
                         self._log(
                             f" --> Imputing {nans} missing values with value "
-                            f"'{self.strat_cat}' in feature {name}.",
-                            2,
+                            f"'{self.strat_cat}' in column {name}.", 2,
                         )
 
         Xt = self._estimator.transform(X)
@@ -2969,8 +2994,7 @@ def transform(
                     cond = np.abs(z_scores) > self.max_sigma
                     objective = objective.mask(cond, self.method)
                     self._log(
-                        f" --> Replacing {cond.sum()} outlier values with {self.method}.",
-                        2,
+                        f" --> Replacing {cond.sum()} outlier values with {self.method}.", 2,
                     )
 
                 elif self.method.lower() == "minmax":
@@ -2992,8 +3016,7 @@ def transform(
 
                     self._log(
                         f" --> Replacing {counts} outlier values "
-                        "with the min or max of the column.",
-                        2,
+                        "with the min or max of the column.", 2,
                     )
 
                 elif self.method.lower() == "drop":
@@ -3002,8 +3025,7 @@ def transform(
                     if len(lst(self.strategy)) > 1:
                         self._log(
                             f" --> The zscore strategy detected "
-                            f"{len(mask) - sum(mask)} outliers.",
-                            2,
+                            f"{len(mask) - sum(mask)} outliers.", 2,
                         )
 
             else:
@@ -3013,8 +3035,7 @@ def transform(
                 if len(lst(self.strategy)) > 1:
                     self._log(
                         f" --> The {estimator.__class__.__name__} "
-                        f"detected {len(mask) - sum(mask)} outliers.",
-                        2,
+                        f"detected {len(mask) - sum(mask)} outliers.", 2,
                     )
 
                 # Add the estimator as attribute to the instance

diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py
@@ -1540,8 +1540,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
                     self._log(
                         f"   --> Dropping feature {column} "
                         f"(score: {self.univariate_.scores_[n]:.2f}  "
-                        f"p-value: {self.univariate_.pvalues_[n]:.2f}).",
-                        2,
+                        f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2,
                     )
                     X = X.drop(columns=column)
 
@@ -1570,8 +1569,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
                     if hasattr(self._estimator, "ranking_"):
                         self._log(
                             f"   --> Dropping feature {column} "
-                            f"(rank {self._estimator.ranking_[n]}).",
-                            2,
+                            f"(rank {self._estimator.ranking_[n]}).", 2,
                         )
                     else:
                         self._log(f"   --> Dropping feature {column}.", 2)