Skip to content

Commit

Permalink
refactor imputer
Browse files Browse the repository at this point in the history
  • Loading branch information
tvdboom committed Nov 26, 2023
1 parent dfbb0d9 commit 3b8e4a8
Show file tree
Hide file tree
Showing 13 changed files with 175 additions and 153 deletions.
2 changes: 0 additions & 2 deletions atom/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@
"""

import pandas as pd
import sklearn

from atom.api import ATOMClassifier, ATOMForecaster, ATOMModel, ATOMRegressor
from atom.utils.constants import __version__


pd.options.mode.copy_on_write = True
sklearn.set_config(transform_output="pandas")
16 changes: 8 additions & 8 deletions atom/atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1266,14 +1266,14 @@ def add(
If the transform method doesn't return a dataframe:
* The column naming happens as follows. If the transformer
has a `get_feature_names_out` or `get_feature_names`
method, it is used. If not, and it returns the same number
of columns, the names are kept equal. If the number of
columns change, old columns will keep their name (as long
as the column is unchanged) and new columns will receive
the name `x[N-1]`, where N stands for the n-th feature.
This means that a transformer should only transform, add
or drop columns, not combinations of these.
has a `get_feature_names_out` method, it is used. If not,
and it returns the same number of columns, the names are
kept equal. If the number of columns changes, old columns
will keep their name (as long as the column is unchanged)
and new columns will receive the name `x[N-1]`, where N
stands for the n-th feature. This means that a transformer
should only transform, add or drop columns, not
combinations of these.
* The index remains the same as before the transformation.
This means that the transformer should not add, remove or
shuffle rows unless it returns a dataframe.
Expand Down
156 changes: 76 additions & 80 deletions atom/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
)
from scipy.stats import zscore
from sklearn.base import BaseEstimator, _clone_parametrized

Check notice on line 39 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _clone_parametrized of a class
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from typing_extensions import Self
Expand All @@ -52,8 +53,8 @@
dataframe_t, sequence_t, series_t,
)
from atom.utils.utils import (
bk, check_is_fitted, composed, crash, get_cols, it, lst, merge,
method_to_log, n_cols, replace_missing, sign, to_df, to_series,
bk, check_is_fitted, composed, crash, get_col_order, get_cols, it, lst,
merge, method_to_log, n_cols, replace_missing, sign, to_df, to_series,
variable_return, wrap_methods,
)

Expand Down Expand Up @@ -1375,6 +1376,13 @@ class Encoder(TransformerMixin):
Value with which to replace rare classes. This parameter is
ignored if `infrequent_to_value=None`.
n_jobs: int, default=1
Number of cores to use for parallel processing.
- If >0: Number of cores to use.
- If -1: Use all available cores.
- If <-1: Use number of cores + 1 + value (e.g., -2 uses all cores but one).
verbose: int, default=0
Verbosity level of the class. Choose from:
Expand All @@ -1394,8 +1402,7 @@ class Encoder(TransformerMixin):
----------
mapping_: dict of dicts
Encoded values and their respective mapping. The column name is
the key to its mapping dictionary. Only for columns mapped to a
single column (e.g., Ordinal, Leave-one-out, etc...).
the key to its mapping dictionary. Only for ordinal encoding.
feature_names_in_: np.ndarray
Names of features seen during fit.
Expand Down Expand Up @@ -1461,11 +1468,12 @@ def __init__(
ordinal: dict[str, Sequence[Any]] | None = None,
infrequent_to_value: FloatLargerZero | None = None,
value: str = "infrequent",
n_jobs: NJobs = 1,
verbose: Verbose = 0,
logger: str | Path | Logger | None = None,
**kwargs,
):
super().__init__(verbose=verbose, logger=logger)
super().__init__(n_jobs=n_jobs, verbose=verbose, logger=logger)
self.strategy = strategy
self.max_onehot = max_onehot
self.ordinal = ordinal
Expand Down Expand Up @@ -1504,11 +1512,9 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
Estimator instance.
"""
self.mapping_: dict[str, dict[Hashable, Scalar]] = defaultdict(dict)
self._to_value = defaultdict(list)
self.mapping_ = {}

Check notice on line 1515 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute mapping_ defined outside __init__
self._to_value = {}

Check notice on line 1516 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute _to_value defined outside __init__
self._categories = {}

Check notice on line 1517 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute _categories defined outside __init__
self._encoders = {}
self._cat_cols = list(X.select_dtypes(exclude="number").columns)

strategies = dict(
backwarddifference=BackwardDifferenceEncoder,
Expand Down Expand Up @@ -1555,13 +1561,14 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:

self._log("Fitting Encoder...", 1)

for name, column in X[self._cat_cols].items():
encoders: dict[str, list[str]] = defaultdict(list)

for name, column in X.select_dtypes(include=CAT_TYPES).items():
# Replace infrequent classes with the string in `value`
if self.infrequent_to_value:
for category, count in column.value_counts().items():
if count <= infrequent_to_value:
self._to_value[name].append(category)
X[name] = column.replace(category, self.value) # type: ignore
values = column.value_counts()
self._to_value[name] = values[values <= infrequent_to_value].index.tolist()

Check warning on line 1570 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Unbound local variables

Local variable 'infrequent_to_value' might be referenced before assignment
X[name] = column.replace(self._to_value[name], self.value)

# Get the unique categories before fitting
self._categories[name] = column.dropna().sort_values().unique().tolist()
Expand All @@ -1584,47 +1591,43 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
mapping.setdefault(np.NaN, -1) # Encoder always needs mapping of NaN
self.mapping_[str(name)] = mapping

self._encoders[name] = OrdinalEncoder(
mapping=[{"col": name, "mapping": mapping}],
cols=[name], # Specify to not skip bool columns
handle_missing="return_nan",
handle_unknown="value",
).fit(X[[name]])

encoders["ordinal"].append(str(name))
elif 2 < len(self._categories[name]) <= max_onehot:
self._encoders[name] = OneHotEncoder(
cols=[name], # Specify to not skip numerical columns
use_cat_names=True,
handle_missing="return_nan",
handle_unknown="value",
).fit(X[[name]])

encoders["onehot"].append(str(name))
else:
args = [X[[name]]]
if "y" in sign(estimator.fit):
args.append(bk.DataFrame(y).iloc[:, 0])

self._encoders[name] = estimator(
cols=[name],
handle_missing="return_nan",
handle_unknown="value",
**self.kwargs,
).fit(*args)

# Create encoding of unique values for mapping
data = self._encoders[name].transform(
bk.Series(
data=self._categories[name],
index=self._categories[name],
name=name,
dtype="object",
)
)
encoders["rest"].append(str(name))

# Only mapping 1 - 1 column
if data.shape[1] == 1:
for idx, value in data[name].items():
self.mapping_[str(name)][idx] = value
ordinal_enc = OrdinalEncoder(
mapping=[{"col": c, "mapping": self.mapping_[c]} for c in encoders["ordinal"]],
cols=encoders["ordinal"],
handle_missing="return_nan",
handle_unknown="value",
)

onehot_enc = OneHotEncoder(
cols=encoders["onehot"],
use_cat_names=True,
handle_missing="return_nan",
handle_unknown="value",
)

rest_enc = estimator(
cols=encoders["rest"],
handle_missing="return_nan",
handle_unknown="value",
**self.kwargs,
)

self._estimator = ColumnTransformer(

Check notice on line 1621 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute _estimator defined outside __init__
transformers=[
("ordinal", ordinal_enc, encoders["ordinal"]),
("onehot", onehot_enc, encoders["onehot"]),
("rest", rest_enc, encoders["rest"]),
],
remainder="passthrough",
n_jobs=self.n_jobs,
verbose_feature_names_out=False,
).fit(X, y)

return self

Expand All @@ -1648,43 +1651,36 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
"""
self._log("Encoding categorical columns...", 1)

for name, column in X[self._cat_cols].items():
# Convert infrequent classes to value
if self._to_value[name]:
X[name] = column.replace(self._to_value[name], self.value)
# Convert infrequent classes to value
X = X.replace(self._to_value, self.value)

Check notice on line 1655 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase

for name, categories in self._categories.items():
if name in self._estimator.transformers_[0][2]:
estimator = self._estimator.transformers_[0][1]
elif name in self._estimator.transformers_[1][2]:
estimator = self._estimator.transformers_[1][1]
else:
estimator = self._estimator.transformers_[2][1]

self._log(
f" --> {self._encoders[name].__class__.__name__[:-7]}-encoding "
f"feature {name}. Contains {column.nunique()} classes.", 2
f" --> {estimator.__class__.__name__[:-7]}-encoding feature "
f"{name}. Contains {X[name].nunique()} classes.", 2
)

# Count the propagated missingX[[name]] values
if n_nans := column.isna().sum():
# Count the propagated missing values
if n_nans := X[name].isna().sum():
self._log(f" --> Propagating {n_nans} missing values.", 2)

# Get the new encoded columns
new_cols = self._encoders[name].transform(X[[name]])

# Drop _nan columns (since missing values are propagated)
new_cols = new_cols.loc[:, ~new_cols.columns.str.endswith("_nan")]

# Check for unknown classes
if uc := len(column.dropna()[~column.isin(self._categories[name])]):
if uc := len(X[name].dropna()[~X[name].isin(categories)]):
self._log(f" --> Handling {uc} unknown classes.", 2)

# Insert the new columns at old location
for i, new_col in enumerate(sorted(new_cols)):
if new_col in X:
X[new_col] = new_cols[new_col].values # Replace existing column
else:
# Drop the original column
if name in X:
idx = X.columns.get_loc(name)
X = X.drop(columns=name)
Xt = self._estimator.transform(X)

Check notice on line 1678 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase

X.insert(idx + i, new_col, new_cols[new_col])
# Drop _nan columns (since missing values are propagated)
Xt = Xt.loc[:, ~Xt.columns.str.endswith("_nan")]

Check notice on line 1681 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase

return X
return Xt[get_col_order(Xt, X.columns.tolist(), self._estimator.feature_names_in_)]


@beartype
Expand Down Expand Up @@ -2318,7 +2314,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log("Normalizing features...", 1)
Xt = self._estimator.transform(X[self._estimator.feature_names_in_])

Check notice on line 2315 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase

X.update(Xt) # Reorder columns to original order
X.update(Xt)

return X[self.feature_names_in_]

Expand Down Expand Up @@ -2862,7 +2858,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log("Scaling features...", 1)
Xt = self._estimator.transform(X[self._estimator.feature_names_in_])

Check notice on line 2859 in atom/data_cleaning.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

PEP 8 naming convention violation

Variable in function should be lowercase

X.update(Xt) # Reorder columns to original order
X.update(Xt)

return X

Expand Down
Loading

0 comments on commit 3b8e4a8

Please sign in to comment.