Skip to content

Commit

Permalink
new strat_num for imputer
Browse files Browse the repository at this point in the history
  • Loading branch information
tvdboom committed Jan 31, 2024
1 parent f96b802 commit 9bf0c8f
Show file tree
Hide file tree
Showing 24 changed files with 531 additions and 422 deletions.
9 changes: 5 additions & 4 deletions atom/atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1699,14 +1699,15 @@ def impute(
Impute or remove missing values according to the selected
strategy. Also removes rows and columns with too many missing
values. Use the `missing` attribute to customize what are
considered "missing values".
values.
See the [Imputer][] class for a description of the parameters.
!!! tip
Use the [nans][self-nans] attribute to check the amount of
missing values per column.
- Use the [nans][self-nans] attribute to check the amount of
missing values per column.
- Use the [`missing`][self-missing] attribute to customize
what are considered "missing values".
"""
columns = kwargs.pop("columns", None)
Expand Down
6 changes: 3 additions & 3 deletions atom/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def fullname(self) -> str:
"""Return the model's class name."""
return self.__class__.__name__

@property
@cached_property
def _est_class(self) -> type[Predictor]:
"""Return the estimator's class (not instance).
Expand Down Expand Up @@ -698,8 +698,8 @@ def _get_pred(
method=method_caller,
)

except ValueError as ex:
# Fails for models that don't allow in-sample predictions
except (ValueError, NotImplementedError) as ex:
# Can fail for models that don't allow in-sample predictions
self._log(
f"Failed to get predictions for model {self.name} "
f"on rows {rows}. Returning NaN. Exception: {ex}.", 3
Expand Down
16 changes: 15 additions & 1 deletion atom/basetransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from joblib.memory import Memory
from pandas._typing import Axes

Check notice on line 33 in atom/basetransformer.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _typing of a class
from ray.util.joblib import register_ray
from sklearn.base import OneToOneFeatureMixin
from sklearn.utils.validation import check_memory

from atom.utils.types import (
Expand All @@ -40,7 +41,9 @@
Pandas, Sequence, Severity, Verbose, Warnings, XSelector, YSelector,
bool_t, dataframe_t, int_t, sequence_t,
)
from atom.utils.utils import crash, flt, lst, n_cols, to_df, to_pandas
from atom.utils.utils import (
crash, flt, lst, n_cols, to_df, to_pandas, wrap_fit,
)


T_Estimator = TypeVar("T_Estimator", bound=Estimator)
Expand Down Expand Up @@ -373,6 +376,11 @@ def _inherit(self, obj: T_Estimator, fixed: tuple[str, ...] = ()) -> T_Estimator
to that of this instance. If `obj` is a meta-estimator, it
also adjusts the parameters of the base estimator.
Additionally, the `fit` method of non-sklearn objects is wrapped
to always add the `n_features_in_` and `feature_names_in_`
attributes, and the `get_feature_names_out` method is added to
transformers that don't have it already.
Parameters
----------
obj: Estimator
Expand All @@ -398,6 +406,12 @@ def _inherit(self, obj: T_Estimator, fixed: tuple[str, ...] = ()) -> T_Estimator
else:
obj.set_params(**{p: lst(self._config.sp.sp)[0]})

if hasattr(obj, "fit") and "sklearn" not in obj.__module__:
obj.__class__.fit = wrap_fit(obj.__class__.fit) # type: ignore[method-assign]
if hasattr(obj, "transform") and not hasattr(obj, "get_feature_names_out"):
# We assume here that the transformer does not create nor remove columns
obj.__class__.get_feature_names_out = OneToOneFeatureMixin.get_feature_names_out

return obj

def _get_est_class(self, name: str, module: str) -> type[Estimator]:
Expand Down
93 changes: 57 additions & 36 deletions atom/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,19 @@
from sktime.transformations.series.detrend import (
ConditionalDeseasonalizer, Deseasonalizer, Detrender,
)
from sktime.transformations.series.impute import Imputer as sktimeImputer
from typing_extensions import Self

from atom.basetransformer import BaseTransformer
from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING
from atom.utils.patches import wrap_method_output
from atom.utils.types import (
Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine,
Estimator, FloatLargerZero, IntLargerEqualZero, IntLargerTwo,
IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor,
PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, Series,
Transformer, Verbose, XSelector, YSelector, dataframe_t, sequence_t,
series_t,
EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero,
IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats,
Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels,
Sequence, Series, Transformer, Verbose, XSelector, YSelector, dataframe_t,
sequence_t, series_t,
)
from atom.utils.utils import (
Goal, bk, check_is_fitted, composed, crash, get_col_order, get_cols, it,
Expand Down Expand Up @@ -92,6 +93,17 @@ def __init_subclass__(cls, **kwargs):
with patch("sklearn.utils._set_output._wrap_method_output", wrap_method_output):
super().__init_subclass__(**kwargs)

def __repr__(self, N_CHAR_MAX: Int = 700) -> str:

Check notice on line 96 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Argument name should be lowercase
"""Drop named tuples with default parameters from the string representation."""
out = super().__repr__(N_CHAR_MAX)

# Remove default engine for cleaner representation
if hasattr(self, "engine") and self.engine == EngineTuple():
out = re.sub(r"engine=EngineTuple\(data='numpy', estimator='sklearn'\)", "", out)
out = re.sub(r"((?<=\(),\s|,\s(?=\))|,\s(?=,\s))", "", out) # Drop comma-spaces

return out

def __sklearn_clone__(self: T) -> T:
"""Wrap cloning method to attach internal attributes."""
cloned = _clone_parametrized(self)
Expand Down Expand Up @@ -1521,10 +1533,6 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:

return labels

Xt, yt = self._check_input(X, y)
self._check_feature_names(Xt, reset=True)
self._check_n_features(Xt, reset=True)

self._estimators: dict[str, Estimator] = {}

Check notice on line 1536 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute _estimators defined outside __init__
self._labels: dict[str, Sequence[str]] = {}

Check notice on line 1537 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute _labels defined outside __init__

Expand All @@ -1548,7 +1556,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
raise ValueError(
"Invalid value for the bins parameter. The length of the "
"bins does not match the length of the columns, got len"
f"(bins)={len(bins_c)} and len(columns)={Xt.shape[1]}."
f"(bins)={len(bins_c)} and len(columns)={X.shape[1]}."
) from None
else:
bins_x = bins_c
Expand All @@ -1566,7 +1574,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
encode="ordinal",
strategy=self.strategy,
**kwargs,
).fit(Xt[[col]])
).fit(X[[col]])

# Save labels for transform method
self._labels[col] = get_labels(
Expand All @@ -1592,7 +1600,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
self._estimators[col] = FunctionTransformer(
func=bk.cut,
kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)},
).fit(Xt[[col]])
).fit(X[[col]])

return self

Expand Down Expand Up @@ -2021,7 +2029,7 @@ class Imputer(TransformerMixin, _SetOutputMixin):
Impute or remove missing values according to the selected strategy.
Also removes rows and columns with too many missing values. Use
the `missing` attribute to customize what are considered "missing
the `missing_` attribute to customize what are considered "missing
values".
This class can be accessed from atom through the [impute]
Expand All @@ -2036,9 +2044,18 @@ class Imputer(TransformerMixin, _SetOutputMixin):
- "drop": Drop rows containing missing values.
- "mean": Impute with mean of column.
- "median": Impute with median of column.
- "most_frequent": Impute with the most frequent value.
- "knn": Impute using a K-Nearest Neighbors approach.
- "iterative": Impute using a multivariate imputer.
- "most_frequent": Impute with the most frequent value.
- "drift": Impute values using a [PolynomialTrend][] model.
- "linear": Impute using linear interpolation.
- "nearest": Impute with nearest value.
- "bfill": Impute by using the next valid observation to fill
the gap.
- "ffill": Impute by propagating the last valid observation
forward to the next valid one.
- "random": Impute with random values between the min and max
of column.
- int or float: Impute with provided numerical value.
strat_cat: str, default="drop"
Expand Down Expand Up @@ -2263,6 +2280,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
num_imputer = IterativeImputer(random_state=self.random_state)
elif self.strat_num == "drop":
num_imputer = "passthrough"
else:
# Inherit sklearn's attributes and methods
num_imputer = self._inherit(
sktimeImputer(
method=self.strat_num,
missing_values=[pd.NA],
random_state=self.random_state,
)
)
else:
num_imputer = SimpleImputer(
missing_values=pd.NA,
Expand Down Expand Up @@ -2401,8 +2427,7 @@ def transform(
if name not in self._estimator.feature_names_in_:
self._log(
f" --> Dropping feature {name}. Contains {nans} "
f"({nans * 100 // len(X)}%) missing values.",
2,
f"({nans * 100 // len(X)}%) missing values.", 2,
)
X = X.drop(columns=name)

Check notice on line 2432 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase
continue
Expand All @@ -2411,34 +2436,34 @@ def transform(
if not isinstance(self.strat_num, str):
self._log(
f" --> Imputing {nans} missing values with "
f"number '{self.strat_num}' in feature {name}.",
2,
f"number '{self.strat_num}' in column {name}.", 2,
)
elif self.strat_num in ("knn", "iterative"):
self._log(
f" --> Imputing {nans} missing values using "
f"the {self.strat_num} imputer in feature {name}.",
2,
f"the {self.strat_num} imputer in column {name}.", 2,
)
elif self.strat_num in ("mean", "median", "most_frequent"):
self._log(
f" --> Imputing {nans} missing values with {self.strat_num} "
f"({np.round(get_stat(num_imputer, name), 2)}) in column "
f"{name}.", 2,
)
elif self.strat_num != "drop": # mean, median or most_frequent
else:
self._log(
f" --> Imputing {nans} missing values with {self.strat_num} "
f"({np.round(get_stat(num_imputer, name), 2)}) in feature "
f"{name}.",
2,
f"in column {name}.", 2,
)
elif self.strat_cat != "drop" and name in cat_imputer.feature_names_in_:
if self.strat_cat == "most_frequent":
self._log(
f" --> Imputing {nans} missing values with most_frequent "
f"({get_stat(cat_imputer, name)}) in feature {name}.",
2,
f"({get_stat(cat_imputer, name)}) in column {name}.", 2,
)
elif self.strat_cat != "drop":
self._log(
f" --> Imputing {nans} missing values with value "
f"'{self.strat_cat}' in feature {name}.",
2,
f"'{self.strat_cat}' in column {name}.", 2,
)

Xt = self._estimator.transform(X)

Check notice on line 2469 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase
Expand Down Expand Up @@ -2969,8 +2994,7 @@ def transform(
cond = np.abs(z_scores) > self.max_sigma
objective = objective.mask(cond, self.method)
self._log(
f" --> Replacing {cond.sum()} outlier values with {self.method}.",
2,
f" --> Replacing {cond.sum()} outlier values with {self.method}.", 2,
)

elif self.method.lower() == "minmax":
Expand All @@ -2992,8 +3016,7 @@ def transform(

self._log(
f" --> Replacing {counts} outlier values "
"with the min or max of the column.",
2,
"with the min or max of the column.", 2,
)

elif self.method.lower() == "drop":
Expand All @@ -3002,8 +3025,7 @@ def transform(
if len(lst(self.strategy)) > 1:
self._log(
f" --> The zscore strategy detected "
f"{len(mask) - sum(mask)} outliers.",
2,
f"{len(mask) - sum(mask)} outliers.", 2,
)

else:
Expand All @@ -3013,8 +3035,7 @@ def transform(
if len(lst(self.strategy)) > 1:
self._log(
f" --> The {estimator.__class__.__name__} "
f"detected {len(mask) - sum(mask)} outliers.",
2,
f"detected {len(mask) - sum(mask)} outliers.", 2,
)

# Add the estimator as attribute to the instance
Expand Down
6 changes: 2 additions & 4 deletions atom/feature_engineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,8 +1540,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(
f" --> Dropping feature {column} "
f"(score: {self.univariate_.scores_[n]:.2f} "
f"p-value: {self.univariate_.pvalues_[n]:.2f}).",
2,
f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2,
)
X = X.drop(columns=column)

Check notice on line 1545 in atom/feature_engineering.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase

Expand Down Expand Up @@ -1570,8 +1569,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
if hasattr(self._estimator, "ranking_"):
self._log(
f" --> Dropping feature {column} "
f"(rank {self._estimator.ranking_[n]}).",
2,
f"(rank {self._estimator.ranking_[n]}).", 2,
)
else:
self._log(f" --> Dropping feature {column}.", 2)
Expand Down
Loading

0 comments on commit 9bf0c8f

Please sign in to comment.