diff --git a/atom/_show_versions.py b/atom/_show_versions.py
index ed013853e..56dfcbdc9 100644
--- a/atom/_show_versions.py
+++ b/atom/_show_versions.py
@@ -20,12 +20,11 @@
"atom",
"beartype",
"category_encoders",
- "dagshub",
"dill",
+ "featuretools",
"gplearn",
"imblearn",
"ipywidgets",
- "featuretools",
"joblib",
"matplotlib",
"mlflow",
@@ -35,14 +34,31 @@
"optuna",
"pandas",
"plotly",
- "ray",
- "requests",
"sklearn",
- "sklearnex", # Has no __version__ attribute
"scipy",
"shap",
"sktime",
+ "statsmodels",
"zoofs", # Has no __version__ attribute
+ "botorch",
+ "catboost",
+ "dagshub",
+ "dask[distributed]",
+ "explainerdashboard",
+ "gradio",
+ "lightgbm",
+ "modin[ray]",
+ "polars",
+ "pyarrow",
+ "pyspark",
+ "ray[serve]",
+ "requests",
+ "sklearnex",
+ "schemdraw",
+ "statsforecast",
+ "sweetviz",
+ "wordcloud",
+ "xgboost",
]
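
Several of the added entries carry pip extras (e.g., "dask[distributed]",
"modin[ray]") and can't be imported verbatim, and a few packages expose no
__version__ attribute. A minimal sketch of a version lookup that tolerates
both cases (an illustration, not this module's code; get_version is a
hypothetical helper):

    import re
    from importlib import import_module

    def get_version(package: str) -> str | None:
        """Return a package's version, tolerating extras and missing attrs."""
        name = re.sub(r"\[.*\]", "", package)  # "modin[ray]" -> "modin"
        try:
            module = import_module(name)
        except ImportError:
            return None  # optional dependency not installed
        return getattr(module, "__version__", "unknown")  # e.g., zoofs has none
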
diff --git a/atom/api.py b/atom/api.py
index 6bb22eaaf..31dc391c1 100644
--- a/atom/api.py
+++ b/atom/api.py
@@ -158,20 +158,20 @@ class ATOMClassifier(ATOM):
**X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features).
- **y: int, str or sequence**
- Target column corresponding to `X`.
+ **y: int, str, sequence or dataframe-like**
+ Target column(s) corresponding to `X`.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
- y: int, str, dict, sequence or dataframe, default=-1
- Target column corresponding to `X`.
+ y: int, str, sequence or dataframe-like, default=-1
+ Target column(s) corresponding to `X`.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
@@ -257,9 +257,16 @@ class ATOMClassifier(ATOM):
- "data":
+ - "numpy"
- "pandas" (default)
+ - "pandas-pyarrow"
+ - "polars"
+ - "polars-lazy"
- "pyarrow"
- "modin"
+ - "dask"
+ - "pyspark"
+ - "pyspark-pandas"
- "estimator":
@@ -276,6 +283,7 @@ class ATOMClassifier(ATOM):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
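
A hedged usage sketch of the expanded engine and backend options (X and y are
placeholder data; the keywords follow the signature documented above):

    from atom import ATOMClassifier

    atom = ATOMClassifier(
        X, y,
        engine={"data": "polars", "estimator": "sklearn"},  # new data engine
        backend="dask",  # new multi-node backend
        n_jobs=2,        # n_jobs=1 would ignore the parallelization backend
        verbose=2,
    )
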
@@ -428,24 +436,24 @@ class ATOMForecaster(ATOM):
Exogenous feature set corresponding to y, with shape=(n_samples,
n_features).
- **y: int, str or sequence**
+ **y: int, str, sequence or dataframe-like**
Time series.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
- y: int, str, dict, sequence or dataframe, default=-1
+ y: int, str, sequence or dataframe-like, default=-1
Time series.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
This parameter is ignored if the time series is provided
through `arrays`.
@@ -526,9 +534,16 @@ class ATOMForecaster(ATOM):
- "data":
+ - "numpy"
- "pandas" (default)
+ - "pandas-pyarrow"
+ - "polars"
+ - "polars-lazy"
- "pyarrow"
- "modin"
+ - "dask"
+ - "pyspark"
+ - "pyspark-pandas"
- "estimator":
@@ -545,6 +560,7 @@ class ATOMForecaster(ATOM):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -689,24 +705,24 @@ class ATOMRegressor(ATOM):
**X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features).
- **y: int, str or sequence**
- Target column corresponding to `X`.
+ **y: int, str, sequence or dataframe-like**
+ Target column(s) corresponding to `X`.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
- y: int, str, dict, sequence or dataframe, default=-1
- Target column corresponding to `X`.
+ y: int, str, sequence or dataframe-like, default=-1
+ Target column(s) corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
This parameter is ignored if the target column is provided
through `arrays`.
@@ -775,9 +791,16 @@ class ATOMRegressor(ATOM):
- "data":
+ - "numpy"
- "pandas" (default)
+ - "pandas-pyarrow"
+ - "polars"
+ - "polars-lazy"
- "pyarrow"
- "modin"
+ - "dask"
+ - "pyspark"
+ - "pyspark-pandas"
- "estimator":
@@ -794,6 +817,7 @@ class ATOMRegressor(ATOM):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
diff --git a/atom/atom.py b/atom/atom.py
index 37abdd8ab..8b6c600de 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -27,12 +27,10 @@
from scipy import stats
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.utils.metaestimators import available_if
-from statsmodels.stats.diagnostic import acorr_ljungbox
-from statsmodels.tsa.stattools import adfuller, kpss
from atom.baserunner import BaseRunner
from atom.basetransformer import BaseTransformer
-from atom.branch import Branch, BranchManager
+from atom.data import Branch, BranchManager
from atom.data_cleaning import (
Balancer, Cleaner, Decomposer, Discretizer, Encoder, Imputer, Normalizer,
Pruner, Scaler, TransformerMixin,
@@ -50,22 +48,21 @@
)
from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING, __version__
from atom.utils.types import (
- Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DataFrame,
- DiscretizerStrats, Engine, EngineTuple, Estimator, FeatureNamesOut,
- FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero,
- FloatLargerZero, FloatZeroToOneInc, Index, IndexSelector, Int,
- IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor,
- ModelsConstructor, NItems, NJobs, NormalizerStrats, NumericalStrats,
- Operators, Pandas, Predictor, PrunerStrats, RowSelector, Scalar,
- ScalerStrats, Seasonality, Sequence, Series, SPDict, TargetSelector,
- Transformer, VectorizerStarts, Verbose, Warnings, XSelector, YSelector,
- sequence_t,
+ Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DiscretizerStrats,
+ Engine, EngineTuple, Estimator, FeatureNamesOut, FeatureSelectionSolvers,
+ FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero,
+ FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo,
+ IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs,
+ NormalizerStrats, NumericalStrats, Operators, Predictor, PrunerStrats,
+ RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, SPDict,
+ TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, XReturn,
+ XSelector, YReturn, YSelector, sequence_t,
)
from atom.utils.utils import (
- ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk,
- check_dependency, check_scaling, composed, crash, fit_one, flt, get_cols,
- get_custom_scorer, has_task, is_sparse, lst, make_sklearn, merge,
- method_to_log, replace_missing, sign, to_pyarrow,
+ ClassMap, DataConfig, DataContainer, Goal, adjust, check_dependency,
+ composed, crash, fit_one, flt, get_cols, get_custom_scorer, has_task,
+ is_sparse, lst, make_sklearn, merge, method_to_log, n_cols,
+ replace_missing, sign,
)
@@ -156,9 +153,8 @@ def __init__(
self._log(f"Parallel processing with {self.n_jobs} cores.", 1)
elif self.backend != "loky":
self._log(
- "Leaving n_jobs=1 ignores all parallelization. Set n_jobs>1 to make use "
- f"of the {self.backend} parallelization backend.",
- 1,
+ "Leaving n_jobs=1 ignores all parallelization. Set n_jobs>1 to "
+ f"make use of the {self.backend} parallelization backend.", 1,
severity="warning",
)
if "cpu" not in self.device.lower():
@@ -167,7 +163,7 @@ def __init__(
self._log(f"Data engine: {self.engine.data}", 1)
if self.engine.estimator != EngineTuple().estimator:
self._log(f"Estimator engine: {self.engine.estimator}", 1)
- if self.backend == "ray" or self.n_jobs > 1:
+ if self.backend != "loky" and self.n_jobs > 1:
self._log(f"Parallelization backend: {self.backend}", 1)
if self.memory.location is not None:
self._log(f"Cache storage: {os.path.join(self.memory.location, 'joblib')}", 1)
@@ -315,27 +311,28 @@ def missing(self, value: Sequence[Any]):
def scaled(self) -> bool:
"""Whether the feature set is scaled.
- A data set is considered scaled when it has mean=0 and std=1,
- or when there is a scaler in the pipeline. Binary columns (only
- zeros and ones) are excluded from the calculation.
+ A data set is considered scaled when it has mean~0 and std~1,
+ or when there is a scaler in the pipeline. Categorical and
+ binary columns (only zeros and ones) are excluded from the
+ calculation.
"""
- return check_scaling(self.X, pipeline=self.pipeline)
+ return self.branch.check_scaling()
@property
- def duplicates(self) -> Int:
+ def duplicates(self) -> int:
"""Number of duplicate rows in the dataset."""
- return self.branch.dataset.duplicated().sum()
+ return int(self.branch.dataset.duplicated().sum())
@property
- def nans(self) -> Series:
+ def nans(self) -> pd.Series:
"""Columns with the number of missing values in them.
This property is unavailable for [sparse datasets][].
"""
- if not is_sparse(self.X):
- return replace_missing(self.X, self.missing).isna().sum()
+ if not is_sparse(self.branch.X):
+ return replace_missing(self.branch.X, self.missing).isna().sum()
raise AttributeError("This property is unavailable for sparse datasets.")
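
The heuristic behind check_scaling (a rough sketch under stated assumptions,
not the branch's actual implementation): near-zero means and near-unit
standard deviations after dropping categorical and binary columns.

    import pandas as pd

    def looks_scaled(X: pd.DataFrame, atol: float = 0.25) -> bool:
        """Approximate the mean~0, std~1 check described above."""
        num = X.select_dtypes(include="number")  # skip categorical columns
        num = num.loc[:, num.nunique() > 2]      # skip binary (0/1) columns
        if num.empty:
            return True
        mean_ok = num.mean().abs().lt(atol).all()
        std_ok = (num.std() - 1).abs().lt(atol).all()
        return bool(mean_ok and std_ok)
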
@@ -346,16 +343,16 @@ def n_nans(self) -> int:
This property is unavailable for [sparse datasets][].
"""
- if not is_sparse(self.X):
- nans = replace_missing(self.X, self.missing).isna().sum(axis=1)
+ if not is_sparse(self.branch.X):
+ nans = replace_missing(self.branch.X, self.missing).isna().sum(axis=1)
return len(nans[nans > 0])
raise AttributeError("This property is unavailable for sparse datasets.")
@property
- def numerical(self) -> Index:
+ def numerical(self) -> list[str]:
"""Names of the numerical features in the dataset."""
- return self.X.select_dtypes(include=["number"]).columns
+ return list(self.branch.X.select_dtypes(include=["number"]).columns)
@property
def n_numerical(self) -> int:
@@ -363,9 +360,9 @@ def n_numerical(self) -> int:
return len(self.numerical)
@property
- def categorical(self) -> Index:
+ def categorical(self) -> list[str]:
"""Names of the categorical features in the dataset."""
- return self.X.select_dtypes(include=CAT_TYPES).columns
+ return list(self.branch.X.select_dtypes(include=CAT_TYPES).columns)
@property
def n_categorical(self) -> int:
@@ -379,7 +376,7 @@ def outliers(self) -> pd.Series:
This property is unavailable for [sparse datasets][].
"""
- if not is_sparse(self.X):
+ if not is_sparse(self.branch.X):
data = self.branch.train.select_dtypes(include=["number"])
z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3
z_scores = pd.Series(z_scores.sum(axis=0), index=data.columns)
@@ -388,16 +385,16 @@ def outliers(self) -> pd.Series:
raise AttributeError("This property is unavailable for sparse datasets.")
@property
- def n_outliers(self) -> Int:
+ def n_outliers(self) -> int:
"""Number of samples in the training set containing outliers.
This property is unavailable for [sparse datasets][].
"""
- if not is_sparse(self.X):
+ if not is_sparse(self.branch.X):
data = self.branch.train.select_dtypes(include=["number"])
z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3
- return z_scores.any(axis=1).sum()
+ return int(z_scores.any(axis=1).sum())
raise AttributeError("This property is unavailable for sparse datasets.")
@@ -429,14 +426,14 @@ def classes(self) -> pd.DataFrame:
raise AttributeError("This property is unavailable for regression tasks.")
@property
- def n_classes(self) -> Int | Series:
+ def n_classes(self) -> Int | pd.Series:
"""Number of classes in the target column(s).
This property is only available for classification tasks.
"""
if self.task.is_classification:
- return self.y.nunique(dropna=False)
+ return self.branch.y.nunique(dropna=False)
raise AttributeError("This property is unavailable for regression tasks.")
@@ -482,6 +479,9 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame:
- **p_value:** Corresponding p-value.
"""
+ from statsmodels.stats.diagnostic import acorr_ljungbox
+ from statsmodels.tsa.stattools import adfuller, kpss
+
columns_c = self.branch._get_columns(columns, only_numerical=True)
df = pd.DataFrame(
@@ -500,7 +500,8 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame:
if test == "adf":
stat = adfuller(X, maxlag=None, autolag="AIC")
elif test == "kpss":
- stat = kpss(X, regression="ct", nlags="auto") # ct is trend stationarity
+ # regression='ct' is trend stationarity
+ stat = kpss(X, regression="ct", nlags="auto")
elif test == "lb":
l_jung = acorr_ljungbox(X, lags=None, period=lst(self.sp.sp)[0])
stat = l_jung.loc[l_jung["lb_pvalue"].idxmin()]
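
For reference, the three checks above wrap standard statsmodels tests; a
self-contained sketch (the series and seasonal period are dummy values):

    import numpy as np
    import pandas as pd
    from statsmodels.stats.diagnostic import acorr_ljungbox
    from statsmodels.tsa.stattools import adfuller, kpss

    X = pd.Series(np.random.default_rng(1).normal(size=120))
    sp = 12  # seasonal period

    adf_stat, adf_p = adfuller(X, maxlag=None, autolag="AIC")[:2]   # unit root
    kpss_stat, kpss_p = kpss(X, regression="ct", nlags="auto")[:2]  # trend
    l_jung = acorr_ljungbox(X, lags=None, period=sp)                # autocorr.
    print(adf_p, kpss_p, l_jung["lb_pvalue"].min())
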
@@ -671,7 +672,7 @@ def inverse_transform(
y: YSelector | None = None,
*,
verbose: Verbose | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
@@ -682,20 +683,18 @@ def inverse_transform(
Parameters
----------
        X: dataframe-like or None, default=None
            Transformed feature set with shape=(n_samples, n_features).
-            If None, X is ignored in the transformers.
+            If None, `X` is ignored in the transformers.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Transformed target column corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
verbose: int or None, default=None
Verbosity level for the transformers in the pipeline. If
@@ -710,10 +709,10 @@ def inverse_transform(
Original target column. Only returned if provided.
"""
- X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target)
+ Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target)
- with adjust_verbosity(self.pipeline, verbose) as pipeline:
- return pipeline.inverse_transform(X, y)
+ with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl:
+ return pl.inverse_transform(Xt, yt)
@classmethod
def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM:
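
The recurring adjust(...) pattern replaces adjust_verbosity and additionally
lets the pipeline emit the configured data engine. A hedged usage sketch
(assuming a fitted atom instance and new data X, y):

    # Temporarily silence the transformers and request pandas output,
    # regardless of atom's own verbosity and engine settings.
    with adjust(atom.pipeline, transform="pandas", verbose=0) as pl:
        Xt, yt = pl.transform(X, y)
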
@@ -749,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
**X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features).
- **y: int, str or sequence**
- Target column corresponding to `X`.
+        **y: int, str, sequence or dataframe-like**
+ Target column(s) corresponding to `X`.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
@@ -815,7 +813,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
X_test, y_test = branch.pipeline.transform(branch.X_test, branch.y_test)
# Update complete dataset
- branch._container.data = bk.concat(
+ branch._container.data = pd.concat(
[merge(X_train, y_train), merge(X_test, y_test)]
)
@@ -824,7 +822,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
data=(dataset := branch._container.data.reset_index(drop=True)),
train_idx=dataset.index[:len(branch._container.train_idx)],
test_idx=dataset.index[-len(branch._container.test_idx):],
- n_cols=branch._container.n_cols,
+ n_targets=branch._container.n_targets,
)
# Store inactive branches in memory
@@ -929,7 +927,7 @@ def shrink(
"""
- def get_data(new_t: DtypeObj) -> Series:
+ def get_data(new_t: DtypeObj) -> pd.Series:
"""Get the series with the right data format.
Also converts to sparse format if `dense2sparse=True`.
@@ -941,7 +939,7 @@ def get_data(new_t: DtypeObj) -> Series:
Returns
-------
- series
+ pd.Series
Object with the new data type.
"""
@@ -975,9 +973,6 @@ def get_data(new_t: DtypeObj) -> Series:
data = self.branch.dataset[self.branch._get_columns(columns)]
- # Convert back since convert_dtypes doesn't work properly for pyarrow dtypes
- data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()})
-
# Convert to the best nullable dtype
data = data.convert_dtypes()
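
With the pyarrow round-trip removed, shrinking relies directly on pandas'
nullable dtypes; a small illustration of the convert_dtypes step:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None], "b": [0.5, 1.5, 2.5]})
    print(df.convert_dtypes().dtypes)  # a -> Int64, b -> Float64 (nullable)
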
@@ -1012,11 +1007,6 @@ def get_data(new_t: DtypeObj) -> Series:
get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max()
)
- if self.engine.data == "pyarrow":
- self.branch.dataset = self.dataset.astype(
- {name: to_pyarrow(col) for name, col in self.dataset.items()}
- )
-
self._log("The column dtypes are successfully converted.", 1)
@composed(crash, method_to_log)
@@ -1030,26 +1020,26 @@ def stats(self, _vb: Int = -2, /):
"""
self._log("Dataset stats " + "=" * 20 + " >>", _vb)
- self._log(f"Shape: {self.shape}", _vb)
+ self._log(f"Shape: {self.branch.shape}", _vb)
if self.task.is_forecast and self.sp.sp:
self._log(f"Seasonal period: {self.sp.sp}", _vb)
for ds in ("train", "test", "holdout"):
- if (data := getattr(self, ds)) is not None:
+ if (data := getattr(self.branch, ds)) is not None:
self._log(f"{ds.capitalize()} set size: {len(data)}", _vb)
if self.task.is_forecast:
self._log(f" --> From: {min(data.index)} To: {max(data.index)}", _vb)
self._log("-" * 37, _vb)
- if (memory := self.dataset.memory_usage().sum()) < 1e6:
+ if (memory := self.branch.dataset.memory_usage().sum()) < 1e6:
self._log(f"Memory: {memory / 1e3:.2f} kB", _vb)
else:
self._log(f"Memory: {memory / 1e6:.2f} MB", _vb)
- if is_sparse(self.X):
+ if is_sparse(self.branch.X):
self._log("Sparse: True", _vb)
- if hasattr(self.X, "sparse"): # All columns are sparse
- self._log(f"Density: {100. * self.X.sparse.density:.2f}%", _vb)
+ if hasattr(self.branch.X, "sparse"): # All columns are sparse
+ self._log(f"Density: {100. * self.branch.X.sparse.density:.2f}%", _vb)
else: # Not all columns are sparse
n_sparse = sum(isinstance(self[c].dtype, pd.SparseDtype) for c in self.features)
n_dense = self.n_features - n_sparse
@@ -1062,7 +1052,7 @@ def stats(self, _vb: Int = -2, /):
n_categorical = self.n_categorical
outliers = self.outliers.sum()
try: # Can fail for unhashable columns (e.g., multilabel with lists)
- duplicates = self.dataset.duplicated().sum()
+ duplicates = self.branch.dataset.duplicated().sum()
except TypeError:
duplicates = None
self._log(
@@ -1071,7 +1061,7 @@ def stats(self, _vb: Int = -2, /):
3,
)
- if not self.X.empty:
+ if not self.branch.X.empty:
self._log(f"Scaled: {self.scaled}", _vb)
if nans:
p_nans = round(100 * nans / self.branch.dataset.size, 1)
@@ -1103,31 +1093,29 @@ def transform(
y: YSelector | None = None,
*,
verbose: Verbose | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. If only `X` or only `y` is provided, it ignores
transformers that require the other parameter. This can be
- of use to, for example, transform only the target column.
+ of use to, for example, transform only the target column.
Parameters
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored. If None,
- X is ignored in the transformers.
+ `X` is ignored.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
verbose: int or None, default=None
Verbosity level for the transformers in the pipeline. If
@@ -1142,10 +1130,10 @@ def transform(
Transformed target column. Only returned if provided.
"""
- X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target)
+ Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target)
- with adjust_verbosity(self.pipeline, verbose) as pipeline:
- return pipeline.transform(X, y)
+ with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl:
+ return pl.transform(Xt, yt)
# Base transformers ============================================ >>
@@ -1153,11 +1141,15 @@ def _prepare_kwargs(
self,
kwargs: dict[str, Any],
params: MappingProxyType | None = None,
+ *,
+ is_runner: Bool = False,
) -> dict[str, Any]:
"""Return kwargs with atom's values if not specified.
This method is used for all transformers and runners to pass
- atom's BaseTransformer's properties to the classes.
+ atom's BaseTransformer's properties to the classes. The engine
+ parameter is the only one that is modified for non-runners
+ since ATOM's transformers only accept the estimator engine.
Parameters
----------
@@ -1167,6 +1159,9 @@ def _prepare_kwargs(
params: mappingproxy or None, default=None
Parameters in the class' signature.
+ is_runner: bool, default=False
+ Whether the params are passed to a runner.
+
Returns
-------
dict
@@ -1175,7 +1170,12 @@ def _prepare_kwargs(
"""
for attr in BaseTransformer.attrs:
if (not params or attr in params) and attr not in kwargs:
- kwargs[attr] = getattr(self, attr)
+ if attr == "engine" and not is_runner:
+ # Engine parameter is special since we don't
+ # want to change data engines in the pipeline
+ kwargs[attr] = getattr(self, attr).estimator
+ else:
+ kwargs[attr] = getattr(self, attr)
return kwargs
@@ -1276,8 +1276,8 @@ def _add_transformer(
fit = self._memory.cache(fit_one)
kwargs = {
"estimator": transformer_c,
- "X": self.X_train,
- "y": self.y_train,
+ "X": self.branch.X_train,
+ "y": self.branch.y_train,
**fit_params,
}
@@ -1296,35 +1296,45 @@ def _add_transformer(
self._branches.add("og")
if transformer_c._train_only:
- X, y = self.pipeline._mem_transform(transformer_c, self.X_train, self.y_train)
- self.train = merge(
- self.X_train if X is None else X,
- self.y_train if y is None else y,
+ X, y = self.pipeline._mem_transform(
+ transformer=transformer_c,
+ X=self.branch.X_train,
+ y=self.branch.y_train,
+ )
+
+ self.branch.train = merge(
+ self.branch.X_train if X is None else X,
+ self.branch.y_train if y is None else y,
)
+
else:
- X, y = self.pipeline._mem_transform(transformer_c, self.X, self.y)
- data = merge(self.X if X is None else X, self.y if y is None else y)
+ X, y = self.pipeline._mem_transform(transformer_c, self.branch.X, self.branch.y)
+ data = merge(self.branch.X if X is None else X, self.branch.y if y is None else y)
# y can change the number of columns or remove rows -> reassign index
- self.branch._container = DataContainer(
- data=data,
- train_idx=self.branch._data.train_idx.intersection(data.index),
- test_idx=self.branch._data.test_idx.intersection(data.index),
- n_cols=self.branch._data.n_cols if y is None else len(get_cols(y)),
+ self._branches.fill(
+ DataContainer(
+ data=data,
+ train_idx=self.branch._data.train_idx.intersection(data.index),
+ test_idx=self.branch._data.test_idx.intersection(data.index),
+ n_targets=self.branch._data.n_targets if y is None else n_cols(y),
+ )
)
if self._config.index is False:
- self.branch._container = DataContainer(
- data=(data := self.dataset.reset_index(drop=True)),
- train_idx=data.index[: len(self.branch._data.train_idx)],
- test_idx=data.index[-len(self.branch._data.test_idx):],
- n_cols=self.branch._data.n_cols,
+ self._branches.fill(
+ DataContainer(
+ data=(data := self.branch.dataset.reset_index(drop=True)),
+ train_idx=data.index[: len(self.branch._data.train_idx)],
+ test_idx=data.index[-len(self.branch._data.test_idx):],
+ n_targets=self.branch._data.n_targets,
+ )
)
if self.branch._holdout is not None:
- self.branch._holdout.index = range(
- len(data), len(data) + len(self.branch._holdout)
+ self.branch._holdout.index = pd.Index(
+ range(len(data), len(data) + len(self.branch._holdout))
)
- elif self.dataset.index.duplicated().any():
+ elif self.branch.dataset.index.duplicated().any():
raise ValueError(
"Duplicate indices found in the dataset. "
"Try initializing atom using `index=False`."
@@ -1452,8 +1462,8 @@ def add(
@composed(crash, method_to_log)
def apply(
self,
- func: Callable[..., DataFrame],
- inverse_func: Callable[..., DataFrame] | None = None,
+ func: Callable[..., pd.DataFrame],
+ inverse_func: Callable[..., pd.DataFrame] | None = None,
*,
feature_names_out: FeatureNamesOut = None,
kw_args: dict[str, Any] | None = None,
@@ -1477,8 +1487,8 @@ def apply(
Parameters
----------
func: callable
- Function to apply with signature `func(dataset, **kw_args) ->
- dataset`.
+ Function to apply with signature `func(dataframe, **kw_args)
+ -> dataframe-like`.
inverse_func: callable or None, default=None
Inverse function of `func`. If None, the inverse_transform
@@ -1729,8 +1739,8 @@ def encode(
@composed(crash, method_to_log)
def impute(
self,
- strat_num: Scalar | NumericalStrats = "drop",
- strat_cat: str | CategoricalStrats = "drop",
+ strat_num: Scalar | NumericalStrats = "mean",
+ strat_cat: str | CategoricalStrats = "most_frequent",
*,
max_nan_rows: FloatLargerZero | None = None,
max_nan_cols: FloatLargerZero | None = None,
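
With the new defaults, a bare impute() call fills missing values instead of
dropping rows; a hedged example (assuming an initialized atom instance):

    atom.impute()  # now strat_num="mean", strat_cat="most_frequent"

    # The previous behavior remains available explicitly:
    atom.impute(strat_num="drop", strat_cat="drop")
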
@@ -2215,7 +2225,7 @@ def _run(self, trainer: BaseRunner):
Instance that does the actual model training.
"""
- if any(col.dtype.kind not in "ifu" for col in get_cols(self.y)):
+ if any(col.dtype.kind not in "ifu" for col in get_cols(self.branch.y)):
raise ValueError(
"The target column is not numerical. Use atom.clean() "
"to encode the target column to numerical values."
@@ -2289,7 +2299,7 @@ def run(
n_bootstrap=n_bootstrap,
parallel=parallel,
errors=errors,
- **self._prepare_kwargs(kwargs),
+ **self._prepare_kwargs(kwargs, is_runner=True),
)
)
@@ -2351,7 +2361,7 @@ class for a description of the parameters.
n_bootstrap=n_bootstrap,
parallel=parallel,
errors=errors,
- **self._prepare_kwargs(kwargs),
+ **self._prepare_kwargs(kwargs, is_runner=True),
)
)
@@ -2411,6 +2421,6 @@ class for a description of the parameters.
n_bootstrap=n_bootstrap,
parallel=parallel,
errors=errors,
- **self._prepare_kwargs(kwargs),
+ **self._prepare_kwargs(kwargs, is_runner=True),
)
)
diff --git a/atom/basemodel.py b/atom/basemodel.py
index 584574595..170940b93 100644
--- a/atom/basemodel.py
+++ b/atom/basemodel.py
@@ -15,7 +15,7 @@
from importlib import import_module
from logging import Logger
from pathlib import Path
-from typing import Any, Literal, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
from unittest.mock import patch
import dill as pickle
@@ -23,7 +23,6 @@
import numpy as np
import optuna
import pandas as pd
-import ray
from beartype import beartype
from joblib.memory import Memory
from joblib.parallel import Parallel, delayed
@@ -37,7 +36,6 @@
from optuna.study import Study
from optuna.terminator import report_cross_validation_scores
from optuna.trial import FrozenTrial, Trial, TrialState
-from ray import serve
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_curve
@@ -57,30 +55,33 @@
from sktime.performance_metrics.forecasting import make_forecasting_scorer
from sktime.proba.normal import Normal
from sktime.split import ExpandingWindowSplitter, SingleWindowSplitter
-from starlette.requests import Request
-from atom.branch import Branch, BranchManager
+from atom.data import Branch, BranchManager
from atom.data_cleaning import Scaler
from atom.pipeline import Pipeline
from atom.plots import RunnerPlot
from atom.utils.constants import DF_ATTRS
from atom.utils.patches import fit_and_score
from atom.utils.types import (
- HT, Backend, Bool, DataFrame, Engine, FHConstructor, Float,
- FloatZeroToOneExc, Index, Int, IntLargerEqualZero, MetricConstructor,
- MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS,
- Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, TargetSelector,
- Verbose, Warnings, XSelector, YSelector, dataframe_t, float_t, int_t,
+ HT, Backend, Bool, Engine, FHConstructor, Float, FloatZeroToOneExc, Int,
+ IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas,
+ PredictionMethods, PredictionMethodsTS, Predictor, RowSelector, Scalar,
+ Scorer, Sequence, Stages, TargetSelector, Verbose, Warnings, XReturn,
+ XSelector, YReturn, YSelector, float_t, int_t,
)
from atom.utils.utils import (
ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task,
- TrialsCallback, adjust_verbosity, bk, cache, check_dependency, check_empty,
- check_scaling, composed, crash, estimator_has_attr, flt, get_cols,
- get_custom_scorer, has_task, it, lst, merge, method_to_log, rnd, sign,
- time_to_str, to_pandas,
+ TrialsCallback, adjust, cache, check_dependency, check_empty, composed,
+ crash, estimator_has_attr, flt, get_col_names, get_cols, get_custom_scorer,
+ has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df,
+ to_series, to_tabular,
)
+if TYPE_CHECKING:
+ from starlette.requests import Request
+
+
# Disable optuna info logs (ATOM already displays the same info)
optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -129,9 +130,16 @@ class BaseModel(RunnerPlot):
- "data":
+ - "numpy"
- "pandas" (default)
+ - "pandas-pyarrow"
+ - "polars"
+ - "polars-lazy"
- "pyarrow"
- "modin"
+ - "dask"
+ - "pyspark"
+ - "pyspark-pandas"
- "estimator":
@@ -148,6 +156,7 @@ class BaseModel(RunnerPlot):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -264,9 +273,9 @@ def __init__(
self._branch = branches.current
self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts
- if hasattr(self, "needs_scaling"):
- if self.needs_scaling and not check_scaling(self.X, pipeline=self.pipeline):
- self.scaler = Scaler().fit(self.X_train)
+ if getattr(self, "needs_scaling", None) and not self.branch.check_scaling():
+ self.scaler = Scaler(device=self.device, engine=self.engine.estimator)
+ self.scaler.fit(self.X_train)
def __repr__(self) -> str:
"""Display class name."""
@@ -274,17 +283,25 @@ def __repr__(self) -> str:
def __dir__(self) -> list[str]:
"""Add additional attrs from __getattr__ to the dir."""
- attrs = list(super().__dir__())
+        # Exclude attrs blocked by their available_if conditions
+ attrs = [x for x in super().__dir__() if hasattr(self, x)]
+
if "_branch" in self.__dict__:
- attrs += [x for x in dir(self.branch) if not x.startswith("_")]
- attrs += list(DF_ATTRS)
+ # Add additional attrs from the branch
+ attrs += self.branch._get_shared_attrs()
+
+ # Add additional attrs from the dataset
+ attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)]
+
+        # Add column names (excluding those with non-word characters)
attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)]
+
return attrs
def __getattr__(self, item: str) -> Any:
"""Get attributes from branch or data."""
if "_branch" in self.__dict__:
- if item in dir(self.branch) and not item.startswith("_"):
+ if item in self.branch._get_shared_attrs():
return getattr(self.branch, item) # Get attr from branch
elif item in self.branch.columns:
return self.branch.dataset[item] # Get column
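
A short illustration of the delegation above (the model and the column name
are hypothetical):

    model = atom.winner  # any trained model
    model.shape          # not on the model -> resolved via the branch
    model.alcohol        # a dataset column, served by __getattr__
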
@@ -485,8 +502,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
def _fit_estimator(
self,
estimator: Predictor,
- data: tuple[DataFrame, Pandas],
- validation: tuple[DataFrame, Pandas] | None = None,
+ data: tuple[pd.DataFrame, Pandas],
+ validation: tuple[pd.DataFrame, Pandas] | None = None,
trial: Trial | None = None,
) -> Predictor:
"""Fit the estimator and perform in-training validation.
@@ -688,7 +705,7 @@ def _get_pred(
# Statsmodels models such as SARIMAX and DF require all
# exogenous data after the last row of the train set
# Other models accept this format
- Xe = bk.concat([self.test, self.holdout]) # type: ignore[list-item]
+ Xe = pd.concat([self.test, self.holdout])
exog = Xe.loc[Xe.index <= X.index.max(), self.features] # type: ignore[index]
y_pred = self._prediction(
@@ -704,7 +721,7 @@ def _get_pred(
f"Failed to get predictions for model {self.name} "
f"on rows {rows}. Returning NaN. Exception: {ex}.", 3
)
- y_pred = bk.Series([np.NaN] * len(X), index=X.index)
+ y_pred = pd.Series([np.NaN] * len(X), index=X.index)
else:
y_pred = self._prediction(X.index, verbose=0, method=method_caller)
@@ -722,7 +739,7 @@ def _score_from_est(
self,
scorer: Scorer,
estimator: Predictor,
- X: DataFrame,
+ X: pd.DataFrame,
y: Pandas,
**kwargs,
) -> Float:
@@ -736,11 +753,11 @@ def _score_from_est(
estimator: Predictor
Estimator instance to get the score from.
- X: dataframe
+ X: pd.DataFrame
Feature set.
- y: series or dataframe
- Target column corresponding to `X`.
+ y: pd.Series or pd.DataFrame
+ Target column(s) corresponding to `X`.
**kwargs
Additional keyword arguments for the `scorer`.
@@ -754,11 +771,10 @@ def _score_from_est(
if self.task.is_forecast:
y_pred = estimator.predict(fh=y.index, X=check_empty(X))
else:
- y_pred = to_pandas(
+ y_pred = to_tabular(
data=estimator.predict(X),
index=y.index,
- columns=getattr(y, "columns", None),
- name=getattr(y, "name", None),
+ columns=get_col_names(y),
)
return self._score_from_pred(scorer, y, y_pred, **kwargs)
@@ -854,7 +870,7 @@ def _get_score(
and hasattr(self.estimator, "predict_proba")
):
y_true, y_pred = self._get_pred(rows, method="predict_proba")
- if isinstance(y_pred, dataframe_t):
+ if isinstance(y_pred, pd.DataFrame):
# Update every target column with its corresponding threshold
for i, value in enumerate(threshold):
y_pred.iloc[:, i] = (y_pred.iloc[:, i] > value).astype("int")
@@ -1025,7 +1041,7 @@ def fit_model(
args.append(cols)
# Parallel loop over fit_model
- results = Parallel(n_jobs=self.n_jobs, backend=self.backend)(
+ results = Parallel(n_jobs=self.n_jobs)(
delayed(fit_model)(estimator, i, j) for i, j in splitter.split(*args)
)
@@ -1150,7 +1166,7 @@ def fit_model(
self._log(f"Time elapsed: {time_to_str(self.trials.iat[-1, -2])}", 1)
@composed(crash, method_to_log, beartype)
- def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
+ def fit(self, X: pd.DataFrame | None = None, y: Pandas | None = None):
"""Fit and validate the model.
The estimator is fitted using the best hyperparameters found
@@ -1160,12 +1176,12 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
Parameters
----------
- X: dataframe or None
+ X: pd.DataFrame or None
Feature set with shape=(n_samples, n_features). If None,
`self.X_train` is used.
- y: series, dataframe or None
- Target column corresponding to `X`. If None, `self.y_train`
+ y: pd.Series, pd.DataFrame or None
+ Target column(s) corresponding to `X`. If None, `self.y_train`
is used.
"""
@@ -1233,28 +1249,25 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
sk_model=self.estimator,
artifact_path=self._est_class.__name__,
signature=infer_signature(
- model_input=pd.DataFrame(self.X),
+ model_input=self.X,
model_output=self.estimator.predict(self.X_test.iloc[[0]]),
),
- input_example=pd.DataFrame(self.X.iloc[[0]]),
+ input_example=self.X.iloc[[0]],
)
if self.log_data:
for ds in ("train", "test"):
- mlflow.log_input(
- dataset=from_pandas(pd.DataFrame(getattr(self, ds))),
- context=ds,
- )
+ mlflow.log_input(dataset=from_pandas(getattr(self, ds)), context=ds)
if self.log_pipeline:
mlflow.sklearn.log_model(
sk_model=self.export_pipeline(),
artifact_path=f"{self._est_class.__name__}_pipeline",
signature=infer_signature(
- model_input=pd.DataFrame(self.X),
+ model_input=self.X,
model_output=self.estimator.predict(self.X_test.iloc[[0]]),
),
- input_example=pd.DataFrame(self.X.iloc[[0]]),
+ input_example=self.X.iloc[[0]],
)
@composed(crash, method_to_log, beartype)
@@ -1629,22 +1642,22 @@ def pipeline(self) -> Pipeline:
return self.branch.pipeline
@property
- def dataset(self) -> DataFrame:
+ def dataset(self) -> pd.DataFrame:
"""Complete data set."""
return merge(self.X, self.y)
@property
- def train(self) -> DataFrame:
+ def train(self) -> pd.DataFrame:
"""Training set."""
return merge(self.X_train, self.y_train)
@property
- def test(self) -> DataFrame:
+ def test(self) -> pd.DataFrame:
"""Test set."""
return merge(self.X_test, self.y_test)
@property
- def holdout(self) -> DataFrame | None:
+ def holdout(self) -> pd.DataFrame | None:
"""Holdout set."""
if (holdout := self.branch.holdout) is not None:
if self.scaler:
@@ -1655,23 +1668,24 @@ def holdout(self) -> DataFrame | None:
return None
@property
- def X(self) -> DataFrame:
+ def X(self) -> pd.DataFrame:
"""Feature set."""
- return bk.concat([self.X_train, self.X_test])
+ return pd.concat([self.X_train, self.X_test])
@property
def y(self) -> Pandas:
- """Target column."""
- return bk.concat([self.y_train, self.y_test])
+ """Target column(s)."""
+ return pd.concat([self.y_train, self.y_test])
@property
- def X_train(self) -> DataFrame:
+ def X_train(self) -> pd.DataFrame:
"""Features of the training set."""
features = self.branch.features.isin(self._config.ignore)
+ X_train = self.branch.X_train.iloc[-self._train_idx:, ~features]
if self.scaler:
- return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features])
+ return cast(pd.DataFrame, self.scaler.transform(X_train))
else:
- return self.branch.X_train.iloc[-self._train_idx:, ~features]
+ return X_train
@property
def y_train(self) -> Pandas:
@@ -1679,16 +1693,17 @@ def y_train(self) -> Pandas:
return self.branch.y_train[-self._train_idx:]
@property
- def X_test(self) -> DataFrame:
+ def X_test(self) -> pd.DataFrame:
"""Features of the test set."""
features = self.branch.features.isin(self._config.ignore)
+ X_test = self.branch.X_test.iloc[:, ~features]
if self.scaler:
- return self.scaler.transform(self.branch.X_test.iloc[:, ~features])
+ return cast(pd.DataFrame, self.scaler.transform(X_test))
else:
- return self.branch.X_test.iloc[:, ~features]
+ return X_test
@property
- def X_holdout(self) -> DataFrame | None:
+ def X_holdout(self) -> pd.DataFrame | None:
"""Features of the holdout set."""
if self.holdout is not None:
return self.holdout[self.features]
@@ -1709,34 +1724,34 @@ def shape(self) -> tuple[Int, Int]:
return self.dataset.shape
@property
- def columns(self) -> Index:
+ def columns(self) -> list[str]:
"""Name of all the columns."""
- return self.dataset.columns
+ return list(self.dataset.columns)
@property
- def n_columns(self) -> Int:
+ def n_columns(self) -> int:
"""Number of columns."""
return len(self.columns)
@property
- def features(self) -> Index:
+ def features(self) -> list[str]:
"""Name of the features."""
- return self.columns[:-self.branch._data.n_cols]
+ return list(self.columns[:-self.branch._data.n_targets])
@property
- def n_features(self) -> Int:
+ def n_features(self) -> int:
"""Number of features."""
return len(self.features)
@property
- def _all(self) -> DataFrame:
+ def _all(self) -> pd.DataFrame:
"""Dataset + holdout.
Note that calling this property triggers the holdout set
calculation.
"""
- return bk.concat([self.dataset, self.holdout])
+ return pd.concat([self.dataset, self.holdout])
# Utility methods ============================================== >>
@@ -1837,8 +1852,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]:
"""
conv = lambda elem: elem.item() if hasattr(elem, "item") else elem
- y_pred = self.inverse_transform(y=self.predict([X], verbose=0), verbose=0)
- if isinstance(y_pred, dataframe_t):
+ if isinstance(y_pred := self.predict([X], verbose=0), pd.DataFrame):
return [conv(elem) for elem in y_pred.iloc[0, :]]
else:
return conv(y_pred[0])
@@ -1859,7 +1873,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]:
self.app = Interface(
fn=inference,
inputs=inputs,
- outputs=["label"] * self.branch._data.n_cols,
+ outputs=["label"] * self.branch._data.n_targets,
allow_flagging=kwargs.pop("allow_flagging", "never"),
**{k: v for k, v in kwargs.items() if k in sign(Interface)},
)
@@ -2082,12 +2096,12 @@ def evaluate(
"""
if isinstance(threshold, float_t):
- threshold_c = [threshold] * self.branch._data.n_cols # Length=n_targets
- elif len(threshold) != self.branch._data.n_cols:
+ threshold_c = [threshold] * self.branch._data.n_targets # Length=n_targets
+ elif len(threshold) != self.branch._data.n_targets:
raise ValueError(
"Invalid value for the threshold parameter. The length of the list "
f"list should be equal to the number of target columns, got len(target)"
- f"={self.branch._data.n_cols} and len(threshold)={len(threshold)}."
+ f"={self.branch._data.n_targets} and len(threshold)={len(threshold)}."
)
else:
threshold_c = list(threshold)
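
In practice (a hedged example with three assumed target columns):

    atom.winner.evaluate(threshold=0.5)              # broadcast to all targets
    atom.winner.evaluate(threshold=[0.3, 0.5, 0.7])  # one value per target
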
@@ -2184,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False):
if include_holdout and self.holdout is None:
raise ValueError("No holdout data set available.")
- if include_holdout and self.holdout is not None:
- X = bk.concat([self.X, self.X_holdout])
- y = bk.concat([self.y, self.y_holdout])
- else:
+ if not include_holdout:
X, y = self.X, self.y
+ else:
+ X = pd.concat([self.X, self.X_holdout])
+ y = pd.concat([self.y, self.y_holdout])
# Assign a mlflow run to the new estimator
if self.experiment:
@@ -2234,11 +2248,11 @@ def inverse_transform(
y: YSelector | None = None,
*,
verbose: Verbose | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
- skipped. The rest should all implement a `inverse_transform`
+ skipped. The rest should all implement an `inverse_transform`
method. If only `X` or only `y` is provided, it ignores
transformers that require the other parameter. This can be
of use to, for example, inversely transform only the target
@@ -2249,18 +2263,17 @@ def inverse_transform(
----------
X: dataframe-like or None, default=None
Transformed feature set with shape=(n_samples, n_features).
- If None, X is ignored in the transformers.
+ If None, `X` is ignored in the transformers.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
verbose: int or None, default=None
Verbosity level for the transformers in the pipeline. If
@@ -2275,10 +2288,10 @@ def inverse_transform(
Original target column. Only returned if provided.
"""
- X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target)
+ Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target)
- with adjust_verbosity(self.pipeline, verbose) as pipeline:
- return pipeline.inverse_transform(X, y)
+ with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl:
+ return pl.inverse_transform(Xt, yt)
@composed(crash, method_to_log, beartype)
def register(
@@ -2378,8 +2391,11 @@ def serve(self, method: str = "predict", host: str = "127.0.0.1", port: Int = 80
Port for HTTP server.
"""
+ check_dependency("ray")
+ import ray
+ from ray.serve import deployment, run
- @serve.deployment
+ @deployment
class ServeModel:
"""Model deployment class.
@@ -2413,16 +2429,12 @@ async def __call__(self, request: Request) -> np.ndarray:
"""
payload = await request.json()
- return getattr(self.pipeline, self.method)(bk.read_json(payload))
+ return getattr(self.pipeline, self.method)(pd.read_json(payload))
if not ray.is_initialized():
ray.init(log_to_driver=False)
- server = ServeModel.bind(
- pipeline=self.export_pipeline(),
- method=method,
- )
- serve.run(server, host=host, port=port)
+        server = ServeModel.bind(pipeline=self.export_pipeline(), method=method)
+        run(server, host=host, port=port)
self._log(f"Serving model {self.fullname} on {host}:{port}...", 1)
@@ -2433,7 +2445,7 @@ def transform(
y: YSelector | None = None,
*,
verbose: Verbose | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Transform new data through the pipeline.
Transformers that are only applied on the training set are
@@ -2447,19 +2459,18 @@ def transform(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored. If None,
- X is ignored in the transformers.
+            `X` is ignored.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
verbose: int or None, default=None
Verbosity level for the transformers in the pipeline. If
@@ -2474,10 +2485,10 @@ def transform(
Transformed target column. Only returned if provided.
"""
- X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target)
+ Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target)
- with adjust_verbosity(self.pipeline, verbose) as pipeline:
- return pipeline.transform(X, y)
+ with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl:
+ return pl.transform(Xt, yt)
class ClassRegModel:
@@ -2517,20 +2528,25 @@ def _prediction(
y: YSelector | None = ...,
metric: str | MetricFunction | Scorer | None = ...,
sample_weight: Sequence[Scalar] | None = ...,
- verbose: Int | None = ...,
- method: Literal["score"] = ...,
- ) -> Float: ...
+ verbose: Verbose | None = ...,
+ method: Literal[
+ "decision_function",
+ "predict",
+ "predict_log_proba",
+ "predict_proba",
+ ] = ...,
+ ) -> Pandas: ...
@overload
def _prediction(
self,
X: RowSelector | XSelector,
- y: YSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- sample_weight: Sequence[Scalar] | None = ...,
- verbose: Int | None = ...,
- method: PredictionMethods = ...,
- ) -> Pandas: ...
+ y: YSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ sample_weight: Sequence[Scalar] | None,
+ verbose: Verbose | None,
+ method: Literal["score"],
+ ) -> Float: ...
def _prediction(
self,
@@ -2538,7 +2554,7 @@ def _prediction(
y: YSelector | None = None,
metric: str | MetricFunction | Scorer | None = None,
sample_weight: Sequence[Scalar] | None = None,
- verbose: Int | None = None,
+ verbose: Verbose | None = None,
method: PredictionMethods = "predict",
) -> Float | Pandas:
"""Get predictions on new data or existing rows.
@@ -2554,13 +2570,12 @@ def _prediction(
set with shape=(n_samples, n_features) to make predictions
on.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
@@ -2590,30 +2605,38 @@ def _prediction(
"""
- def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[DataFrame, Pandas]:
+ def get_transform_X_y(
+ X: RowSelector | XSelector,
+ y: YSelector | None,
+ ) -> tuple[pd.DataFrame, Pandas | None]:
"""Get X and y from the pipeline transformation.
Parameters
----------
- X: dataframe-like
- Feature set.
+ X: hashable, segment, sequence or dataframe-like
+            Feature set. If not dataframe-like, the transformation
+            is expected to fail.
- y: int, str or sequence
- Target column.
+ y: int, str, sequence, dataframe-like or None
+ Target column(s) corresponding to `X`.
Returns
-------
dataframe
Transformed feature set.
- series or dataframe
+ series, dataframe or None
Transformed target column.
"""
- if isinstance(out := self.transform(X, y, verbose=verbose), tuple):
+ Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target)
+
+ with adjust(self.pipeline, verbose=verbose) as pl:
+ out = pl.transform(Xt, yt)
+
+ if isinstance(out, tuple):
return out
else:
- return out, y
+ return out, yt
def assign_prediction_columns() -> list[str]:
"""Assign column names for the prediction methods.
@@ -2630,7 +2653,7 @@ def assign_prediction_columns() -> list[str]:
return self.mapping.get(self.target, np.unique(self.y).astype(str))
try:
- if isinstance(X, dataframe_t):
+ if isinstance(X, pd.DataFrame):
# Dataframe must go first since we can expect
# prediction calls from dataframes with reset indices
Xt, yt = get_transform_X_y(X, y)
@@ -2645,32 +2668,27 @@ def assign_prediction_columns() -> list[str]:
if method != "score":
pred = np.array(self.memory.cache(getattr(self.estimator, method))(Xt[self.features]))
- if pred.ndim < 3:
- data = to_pandas(
- data=pred,
- index=Xt.index,
- name=self.target,
- columns=assign_prediction_columns(),
- )
+ if pred.ndim == 1 or pred.shape[1] == 1:
+ return to_series(pred, index=Xt.index, name=self.target)
+ elif pred.ndim < 3:
+ return to_df(pred, index=Xt.index, columns=assign_prediction_columns())
elif self.task is Task.multilabel_classification:
# Convert to (n_samples, n_targets)
- data = bk.DataFrame(
+ return pd.DataFrame(
data=np.array([d[:, 1] for d in pred]).T,
index=Xt.index,
columns=assign_prediction_columns(),
)
else:
# Convert to (n_samples * n_classes, n_targets)
- data = bk.DataFrame(
+ return pd.DataFrame(
data=pred.reshape(-1, pred.shape[2]),
- index=bk.MultiIndex.from_tuples(
+ index=pd.MultiIndex.from_tuples(
[(col, idx) for col in np.unique(self.y) for idx in Xt.index]
),
columns=assign_prediction_columns(),
)
- return data
-
else:
if metric is None:
scorer = self._metric[0]
@@ -2691,8 +2709,8 @@ def decision_function(
self,
X: RowSelector | XSelector,
*,
- verbose: Int | None = None,
- ) -> Pandas:
+ verbose: Verbose | None = None,
+ ) -> YReturn:
"""Get confidence scores on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -2721,7 +2739,7 @@ def decision_function(
multiclass classification tasks.
"""
- return self._prediction(X, verbose=verbose, method="decision_function")
+ return self._convert(self._prediction(X, verbose=verbose, method="decision_function"))
@available_if(estimator_has_attr("predict"))
@composed(crash, method_to_log, beartype)
@@ -2730,8 +2748,8 @@ def predict(
X: RowSelector | XSelector,
*,
inverse: Bool = True,
- verbose: Int | None = None,
- ) -> Pandas:
+ verbose: Verbose | None = None,
+ ) -> YReturn:
"""Get predictions on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -2769,7 +2787,7 @@ def predict(
if inverse:
return self.inverse_transform(y=pred)
else:
- return pred
+ return self._convert(pred)
@available_if(estimator_has_attr("predict_log_proba"))
@composed(crash, method_to_log, beartype)
@@ -2777,8 +2795,8 @@ def predict_log_proba(
self,
X: RowSelector | XSelector,
*,
- verbose: Int | None = None,
- ) -> DataFrame:
+ verbose: Verbose | None = None,
+ ) -> XReturn:
"""Get class log-probabilities on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -2806,7 +2824,7 @@ def predict_log_proba(
a multiindex format for [multioutput tasks][].
"""
- return self._prediction(X, verbose=verbose, method="predict_log_proba")
+ return self._convert(self._prediction(X, verbose=verbose, method="predict_log_proba"))
@available_if(estimator_has_attr("predict_proba"))
@composed(crash, method_to_log, beartype)
@@ -2814,8 +2832,8 @@ def predict_proba(
self,
X: RowSelector | XSelector,
*,
- verbose: Int | None = None,
- ) -> DataFrame:
+ verbose: Verbose | None = None,
+ ) -> XReturn:
"""Get class probabilities on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -2843,7 +2861,7 @@ def predict_proba(
a multiindex format for [multioutput tasks][].
"""
- return self._prediction(X, verbose=verbose, method="predict_proba")
+ return self._convert(self._prediction(X, verbose=verbose, method="predict_proba"))
@available_if(estimator_has_attr("score"))
@composed(crash, method_to_log, beartype)
@@ -2854,7 +2872,7 @@ def score(
*,
metric: str | MetricFunction | Scorer | None = None,
sample_weight: Sequence[Scalar] | None = None,
- verbose: Int | None = None,
+ verbose: Verbose | None = None,
) -> Float:
"""Get a metric score on new data.
@@ -2876,13 +2894,12 @@ def score(
set with shape=(n_samples, n_features) to make predictions
on.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
- If None: `X` must be a selection of rows in the dataset.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
@@ -2947,26 +2964,44 @@ def get_tags(self) -> dict[str, Any]:
@overload
def _prediction(
self,
- fh: RowSelector | FHConstructor | None = None,
- y: RowSelector | YSelector | None = None,
- X: XSelector | None = None,
- metric: str | MetricFunction | Scorer | None = None,
- verbose: Int | None = None,
- method: Literal["score"] = ...,
+ fh: RowSelector | FHConstructor | None = ...,
+ y: RowSelector | YSelector | None = ...,
+ X: XSelector | None = ...,
+ metric: str | MetricFunction | Scorer | None = ...,
+ verbose: Verbose | None = ...,
+ method: Literal[
+ "predict",
+ "predict_interval",
+ "predict_quantiles",
+ "predict_residuals",
+ "predict_var",
+ ] = ...,
**kwargs,
- ) -> Float: ...
+ ) -> Pandas: ...
@overload
def _prediction(
self,
- fh: RowSelector | FHConstructor | None = None,
- y: RowSelector | YSelector | None = None,
- X: XSelector | None = None,
- metric: str | MetricFunction | Scorer | None = None,
- verbose: Int | None = None,
- method: PredictionMethodsTS = ...,
+ fh: RowSelector | FHConstructor | None,
+ y: RowSelector | YSelector | None,
+ X: XSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ verbose: Verbose | None,
+ method: Literal["predict_proba"],
**kwargs,
- ) -> Pandas: ...
+ ) -> Normal: ...
+
+ @overload
+ def _prediction(
+ self,
+ fh: RowSelector | FHConstructor | None,
+ y: RowSelector | YSelector | None,
+ X: XSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ verbose: Verbose | None,
+ method: Literal["score"],
+ **kwargs,
+ ) -> Float: ...
def _prediction(
self,
@@ -2974,10 +3009,10 @@ def _prediction(
y: RowSelector | YSelector | None = None,
X: XSelector | None = None,
metric: str | MetricFunction | Scorer | None = None,
- verbose: Int | None = None,
+ verbose: Verbose | None = None,
method: PredictionMethodsTS = "predict",
**kwargs,
- ) -> Float | Pandas:
+ ) -> Float | Normal | Pandas:
"""Get predictions on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -2990,7 +3025,7 @@ def _prediction(
The [forecasting horizon][row-and-column-selection] encoding
the time stamps to forecast at.
- y: int, str, dict, sequence, dataframe or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3014,18 +3049,23 @@ def _prediction(
Returns
-------
- float, series or dataframe
+ float, sktime.proba.[Normal][], series or dataframe
Calculated predictions. The return type depends on the method
called.
"""
if y is not None or X is not None:
- if isinstance(out := self.transform(X, y, verbose=verbose), tuple):
+ Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target)
+
+ with adjust(self.pipeline, verbose=verbose) as pl:
+ out = pl.transform(Xt, yt)
+
+ if isinstance(out, tuple):
Xt, yt = out
elif X is not None:
- Xt, yt = out, y
+ Xt, yt = out, yt
else:
- Xt, yt = X, out
+ Xt, yt = Xt, out
else:
Xt, yt = X, y
@@ -3051,8 +3091,9 @@ def predict(
fh: RowSelector | FHConstructor,
X: XSelector | None = None,
*,
- verbose: Int | None = None,
- ) -> Pandas:
+ inverse: Bool = True,
+ verbose: Verbose | None = None,
+ ) -> YReturn:
"""Get predictions on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -3070,6 +3111,12 @@ def predict(
X: hashable, segment, sequence, dataframe-like or None, default=None
Exogenous time series corresponding to `fh`.
+ inverse: bool, default=True
+ Whether to inversely transform the output through the
+ pipeline. This has no effect on the predictions if the
+ pipeline contains no transformers, or if its transformers
+ have no `inverse_transform` method or don't apply to `y`.
+
verbose: int or None, default=None
Verbosity level for the transformers in the pipeline. If
None, it uses the pipeline's verbosity.
@@ -3081,7 +3128,12 @@ def predict(
n_targets) for [multivariate][] tasks.
"""
- return self._prediction(fh=fh, X=X, verbose=verbose, method="predict")
+ pred = self._prediction(fh=fh, X=X, verbose=verbose, method="predict")
+
+ if inverse:
+ return self.inverse_transform(y=pred)
+ else:
+ return self._convert(pred)
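A hedged usage sketch of the new `inverse` flag (the fitted instance and model name are illustrative):

```python
# Assuming atom = ATOMForecaster(y, ...) was run with, e.g., a log transform
# on y, and a fitted model accessible as atom.nf:
pred = atom.nf.predict(fh=range(1, 13))                # back-transformed y
raw = atom.nf.predict(fh=range(1, 13), inverse=False)  # still in log scale,
                                                       # converted by _convert
```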
@available_if(estimator_has_attr("predict_interval"))
@composed(crash, method_to_log, beartype)
@@ -3091,8 +3143,8 @@ def predict_interval(
X: XSelector | None = None,
*,
coverage: Float | Sequence[Float] = 0.9,
- verbose: Int | None = None,
- ) -> DataFrame:
+ verbose: Verbose | None = None,
+ ) -> XReturn:
"""Get prediction intervals on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -3123,12 +3175,14 @@ def predict_interval(
Computed interval forecasts.
"""
- return self._prediction(
- fh=fh,
- X=X,
- coverage=coverage,
- verbose=verbose,
- method="predict_interval",
+ return self._convert(
+ self._prediction(
+ fh=fh,
+ X=X,
+ coverage=coverage,
+ verbose=verbose,
+ method="predict_interval",
+ )
)
@available_if(estimator_has_attr("predict_proba"))
@@ -3139,7 +3193,7 @@ def predict_proba(
X: XSelector | None = None,
*,
marginal: Bool = True,
- verbose: Int | None = None,
+ verbose: Verbose | None = None,
) -> Normal:
"""Get probabilistic forecasts on new data or existing rows.
@@ -3187,8 +3241,8 @@ def predict_quantiles(
X: XSelector | None = None,
*,
alpha: Float | Sequence[Float] = (0.05, 0.95),
- verbose: Int | None = None,
- ) -> DataFrame:
+ verbose: Verbose | None = None,
+ ) -> XReturn:
"""Get quantile forecasts on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -3220,12 +3274,14 @@ def predict_quantiles(
Computed quantile forecasts.
"""
- return self._prediction(
- fh=fh,
- X=X,
- alpha=alpha,
- verbose=verbose,
- method="predict_quantiles",
+ return self._convert(
+ self._prediction(
+ fh=fh,
+ X=X,
+ alpha=alpha,
+ verbose=verbose,
+ method="predict_quantiles",
+ )
)
@available_if(estimator_has_attr("predict_residuals"))
@@ -3235,8 +3291,8 @@ def predict_residuals(
y: RowSelector | YSelector,
X: XSelector | None = None,
*,
- verbose: Int | None = None,
- ) -> Pandas:
+ verbose: Verbose | None = None,
+ ) -> YReturn:
"""Get residuals of forecasts on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -3247,7 +3303,7 @@ def predict_residuals(
Parameters
----------
- y: int, str, dict, sequence or dataframe
+ y: int, str, sequence or dataframe-like
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3264,7 +3320,9 @@ def predict_residuals(
n_targets) for [multivariate][] tasks.
"""
- return self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals")
+ return self._convert(
+ self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals")
+ )
@available_if(estimator_has_attr("predict_var"))
@composed(crash, method_to_log, beartype)
@@ -3274,8 +3332,8 @@ def predict_var(
X: XSelector | None = None,
*,
cov: Bool = False,
- verbose: Int | None = None,
- ) -> DataFrame:
+ verbose: Verbose | None = None,
+ ) -> XReturn:
"""Get variance forecasts on new data or existing rows.
New data is first transformed through the model's pipeline.
@@ -3307,12 +3365,14 @@ def predict_var(
Computed variance forecasts.
"""
- return self._prediction(
- fh=fh,
- X=X,
- cov=cov,
- verbose=verbose,
- method="predict_var",
+ return self._convert(
+ self._prediction(
+ fh=fh,
+ X=X,
+ cov=cov,
+ verbose=verbose,
+ method="predict_var",
+ )
)
@available_if(estimator_has_attr("score"))
@@ -3324,7 +3384,7 @@ def score(
fh: RowSelector | FHConstructor | None = None,
*,
metric: str | MetricFunction | Scorer | None = None,
- verbose: Int | None = None,
+ verbose: Verbose | None = None,
) -> Float:
"""Get a metric score on new data.
@@ -3341,7 +3401,7 @@ def score(
Parameters
----------
- y: int, str, dict, sequence or dataframe
+ y: int, str, sequence or dataframe-like
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
diff --git a/atom/baserunner.py b/atom/baserunner.py
index c85f95222..bc128d132 100644
--- a/atom/baserunner.py
+++ b/atom/baserunner.py
@@ -32,19 +32,18 @@
from atom.basetracker import BaseTracker
from atom.basetransformer import BaseTransformer
-from atom.branch import Branch
+from atom.data import Branch
from atom.models import MODELS, Stacking, Voting
from atom.pipeline import Pipeline
from atom.utils.constants import DF_ATTRS
from atom.utils.types import (
- Bool, DataFrame, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int,
+ Bool, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int,
IntLargerOne, MetricConstructor, Model, ModelSelector, ModelsSelector,
- Pandas, RowSelector, Seasonality, Segment, Sequence, Series, SPDict,
- SPTuple, TargetSelector, YSelector, bool_t, dataframe_t, int_t, segment_t,
- sequence_t,
+ Pandas, RowSelector, Seasonality, Segment, Sequence, SPDict, SPTuple,
+ TargetSelector, YSelector, bool_t, int_t, pandas_t, segment_t, sequence_t,
)
from atom.utils.utils import (
- ClassMap, DataContainer, Goal, SeasonalPeriod, Task, bk, check_is_fitted,
+ ClassMap, DataContainer, Goal, SeasonalPeriod, Task, check_is_fitted,
composed, crash, divide, flt, get_cols, get_segment, get_versions,
has_task, lst, merge, method_to_log, n_cols,
)
@@ -80,27 +79,42 @@ def __setstate__(self, state: dict[str, Any]):
def __dir__(self) -> list[str]:
"""Add additional attrs from __getattr__ to the dir."""
- attrs = list(super().__dir__())
- attrs += [x for x in dir(self.branch) if not x.startswith("_")]
- attrs += list(DF_ATTRS)
+ # Exclude from _available_if conditions
+ attrs = [x for x in super().__dir__() if hasattr(self, x)]
+
+ # Add additional attrs from the branch
+ attrs += self.branch._get_shared_attrs()
+
+ # Add additional attrs from the dataset
+ attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)]
+
+ # Add branch names in lower-case
attrs += [b.name.lower() for b in self._branches]
+
+ # Add column names (excluding those with spaces)
attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)]
+
+ # Add model names in lower-case
if isinstance(self._models, ClassMap):
attrs += [m.name.lower() for m in self._models]
+
return attrs
def __getattr__(self, item: str) -> Any:
"""Get branch, attr from branch, model, column or attr from dataset."""
if item in self.__dict__["_branches"]:
return self._branches[item] # Get branch
- elif item in dir(self.branch) and not item.startswith("_"):
- return getattr(self.branch, item) # Get attr from branch
+ elif item in self.branch._get_shared_attrs():
+ if isinstance(attr := getattr(self.branch, item), pandas_t):
+ return self._convert(attr) # Transform data through data engine
+ else:
+ return attr
elif item in self.__dict__["_models"]:
return self._models[item] # Get model
elif item in self.branch.columns:
return self.branch.dataset[item] # Get column from dataset
- elif item in DF_ATTRS:
- return getattr(self.branch.dataset, item) # Get attr from dataset
+ elif item in DF_ATTRS and hasattr(self.dataset, item):
+ return getattr(self.dataset, item) # Get attr from dataset
else:
raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'.")
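The lookup chain above resolves runner attributes in five steps; a hedged sketch of what each step returns (dataset and names are illustrative):

```python
atom = ATOMClassifier(X, y, random_state=1)  # illustrative setup
atom.branch = "b2"                           # add a second branch
atom.run("LR")

atom.b2       # 1. branch name -> the Branch object
atom.dataset  # 2. shared branch attr -> data, converted by the data engine
atom.lr       # 3. model name (lower-case) -> the fitted model wrapper
atom.age      # 4. column name -> that column (assumes a column called "age")
atom.shape    # 5. DF_ATTRS fallback -> forwarded to the underlying dataframe
```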
@@ -120,7 +134,7 @@ def __delattr__(self, item: str):
def __len__(self) -> int:
"""Return length of dataset."""
- return len(self.dataset)
+ return len(self.branch.dataset)
def __contains__(self, item: str) -> bool:
"""Whether the item is a column in the dataset."""
@@ -159,7 +173,7 @@ def __sklearn_is_fitted__(self) -> bool:
@cached_property
def task(self) -> Task:
"""Dataset's [task][] type."""
- return self._goal.infer_task(self.y)
+ return self._goal.infer_task(self.branch.y)
@property
def sp(self) -> SPTuple:
@@ -202,14 +216,14 @@ def branch(self) -> Branch:
return self._branches.current
@property
- def holdout(self) -> DataFrame | None:
+ def holdout(self) -> pd.DataFrame | None:
"""Holdout set.
This data set is untransformed by the pipeline. Read more in
the [user guide][data-sets].
"""
- return self.branch._holdout
+ return self._convert(self.branch._holdout)
@property
def models(self) -> str | list[str] | None:
@@ -378,11 +392,11 @@ def get_single_sp(sp: Int | str) -> int:
def _get_data(
self,
- arrays: tuple,
+ arrays: tuple[Any, ...],
y: YSelector = -1,
*,
- index: IndexSelector = False,
- ) -> tuple[DataContainer, DataFrame | None]:
+ index: IndexSelector | None = None,
+ ) -> tuple[DataContainer, pd.DataFrame | None]:
"""Get data sets from a sequence of indexables.
Also assigns an index, (stratified) shuffles and selects a
@@ -396,20 +410,21 @@ def _get_data(
y: int, str or sequence, default=-1
Transformed target column.
- index: bool, int, str or sequence, default=False
- Index parameter as provided in constructor.
+ index: bool, int, str, sequence or None, default=None
+ Index parameter as provided in constructor. If None, the
+ index is retrieved from `self._config`.
Returns
-------
DataContainer
Train and test sets.
- dataframe or None
+ pd.DataFrame or None
Holdout data set. Returns None if not specified.
"""
- def _subsample(df: DataFrame) -> DataFrame:
+ def _subsample(df: pd.DataFrame) -> pd.DataFrame:
"""Select a random subset of a dataframe.
If shuffle=True, the subset is shuffled, else row order
@@ -418,12 +433,12 @@ def _subsample(df: DataFrame) -> DataFrame:
Parameters
----------
- df: dataframe
+ df: pd.DataFrame
Dataset.
Returns
-------
- dataframe
+ pd.DataFrame
Subset of df.
"""
@@ -439,25 +454,36 @@ def _subsample(df: DataFrame) -> DataFrame:
else:
return df.iloc[sorted(random.sample(range(len(df)), k=n_rows))]
- def _set_index(df: DataFrame, y: Pandas | None) -> DataFrame:
+ def _set_index(
+ df: pd.DataFrame,
+ y: Pandas | None,
+ index: IndexSelector | None = None,
+ ) -> pd.DataFrame:
"""Assign an index to the dataframe.
Parameters
----------
- df: dataframe
+ df: pd.DataFrame
Dataset.
- y: series, dataframe or None
+ y: pd.Series, pd.DataFrame or None
Target column(s). Used to check that the provided index
is not one of the target columns. If None, the check is
skipped.
+ index: bool, int, str, sequence or None, default=None
+ Index parameter as provided in constructor. If None, the
+ index is retrieved from `self._config`.
+
Returns
-------
- dataframe
+ pd.DataFrame
Dataset with updated indices.
"""
+ if index is None:
+ index = self._config.index
+
if index is True: # True gets caught by isinstance(int)
pass
elif index is False:
@@ -494,9 +520,9 @@ def _set_index(df: DataFrame, y: Pandas | None) -> DataFrame:
return df
def _no_data_sets(
- X: DataFrame,
+ X: pd.DataFrame,
y: Pandas,
- ) -> tuple[DataContainer, DataFrame | None]:
+ ) -> tuple[DataContainer, pd.DataFrame | None]:
"""Generate data sets from one dataset.
Additionally, assigns an index, shuffles the data, selects
@@ -505,10 +531,10 @@ def _no_data_sets(
Parameters
----------
- X: dataframe
+ X: pd.DataFrame
Feature set with shape=(n_samples, n_features).
- y: series or dataframe
+ y: pd.Series or pd.DataFrame
Target column(s) corresponding to `X`.
Returns
@@ -516,7 +542,7 @@ def _no_data_sets(
DataContainer
Train and test sets.
- dataframe or None
+ pd.DataFrame or None
Holdout data set. Returns None if not specified.
"""
@@ -536,7 +562,7 @@ def _no_data_sets(
"Invalid value for the index parameter. Length of index "
f"({len(index)}) doesn't match that of the dataset ({len(data)})."
)
- data.index = index
+ data.index = pd.Index(index)
if len(data) < 5:
raise ValueError(
@@ -589,23 +615,22 @@ def _no_data_sets(
stratify=self._config.get_stratify_columns(data, y),
)
- complete_set = _set_index(bk.concat([train, test, holdout]), y)
+ complete_set = _set_index(pd.concat([train, test, holdout]), y, index)
container = DataContainer(
data=(data := complete_set.iloc[: len(data)]),
train_idx=data.index[:-len(test)],
test_idx=data.index[-len(test):],
- n_cols=len(get_cols(y)),
+ n_targets=n_cols(y),
)
except ValueError as ex:
# Clarify common error with stratification for multioutput tasks
- if "least populated class" in str(ex) and isinstance(y, dataframe_t):
+ if isinstance(y, pd.DataFrame):
raise ValueError(
"Stratification for multioutput tasks is applied over all target "
- "columns, which results in a least populated class that has only "
- "one member. Either select only one column to stratify over, or "
- "set the parameter stratify=False."
+ "columns. Either select only one column to stratify over, or set "
+ "the parameter stratify=False."
) from ex
else:
raise ex
@@ -616,13 +641,13 @@ def _no_data_sets(
return container, holdout
def _has_data_sets(
- X_train: DataFrame,
+ X_train: pd.DataFrame,
y_train: Pandas,
- X_test: DataFrame,
+ X_test: pd.DataFrame,
y_test: Pandas,
- X_holdout: DataFrame | None = None,
+ X_holdout: pd.DataFrame | None = None,
y_holdout: Pandas | None = None,
- ) -> tuple[DataContainer, DataFrame | None]:
+ ) -> tuple[DataContainer, pd.DataFrame | None]:
"""Generate data sets from provided sets.
Additionally, assigns an index, shuffles the data and
@@ -630,22 +655,22 @@ def _has_data_sets(
Parameters
----------
- X_train: dataframe
+ X_train: pd.DataFrame
Training set.
- y_train: series or dataframe
+ y_train: pd.Series or pd.DataFrame
Target column(s) corresponding to `X_train`.
- X_test: dataframe
+ X_test: pd.DataFrame
Test set.
- y_test: series or dataframe
+ y_test: pd.Series or pd.DataFrame
Target column(s) corresponding to `X_test`.
- X_holdout: dataframe or None
- Holdout set. Is None if not provided by the user.
+ X_holdout: pd.DataFrame or None, default=None
+ Holdout set. Can be None if not provided by the user.
- y_holdout: series, dataframe or None
+ y_holdout: pd.Series, pd.DataFrame or None, default=None
Target column(s) corresponding to `X_holdout`.
Returns
@@ -653,7 +678,7 @@ def _has_data_sets(
DataContainer
Train and test sets.
- dataframe or None
+ pd.DataFrame or None
Holdout data set. Returns None if not specified.
"""
@@ -696,18 +721,18 @@ def _has_data_sets(
"Invalid value for the index parameter. Length of index "
f"({len(index)}) doesn't match that of the data sets ({len_data})."
)
- train.index = index[: len(train)]
- test.index = index[len(train): len(train) + len(test)]
+ train.index = pd.Index(index[: len(train)])
+ test.index = pd.Index(index[len(train): len(train) + len(test)])
if holdout is not None:
- holdout.index = index[-len(holdout):]
+ holdout.index = pd.Index(index[-len(holdout):])
- complete_set = _set_index(bk.concat([train, test, holdout]), y_test)
+ complete_set = _set_index(pd.concat([train, test, holdout]), y_test, index)
container = DataContainer(
data=(data := complete_set.iloc[:len(train) + len(test)]),
train_idx=data.index[: len(train)],
test_idx=data.index[-len(test):],
- n_cols=len(get_cols(y_train)),
+ n_targets=n_cols(y_train),
)
if holdout is not None:
@@ -718,16 +743,16 @@ def _has_data_sets(
# Process input arrays ===================================== >>
if len(arrays) == 0:
- if self._goal.name == "forecast" and not isinstance(y, (*int_t, str)):
+ if self.branch._container:
+ return self.branch._data, self.branch._holdout
+ elif self._goal is Goal.forecast and not isinstance(y, (*int_t, str)):
# arrays=() and y=y for forecasting
sets = _no_data_sets(*self._check_input(y=y))
- elif not self.branch._container:
+ else:
raise ValueError(
"The data arrays are empty! Provide the data to run the pipeline "
"successfully. See the documentation for the allowed formats."
)
- else:
- return self.branch._data, self.branch._holdout
elif len(arrays) == 1:
# X or y for forecasting
@@ -787,7 +812,7 @@ def _has_data_sets(
if self._goal.name == "forecast":
# For forecasting, check if index complies with sktime's standard
valid, msg, _ = check_is_mtype(
- obj=pd.DataFrame(bk.concat([sets[0].data, sets[1]])),
+ obj=pd.DataFrame(pd.concat([sets[0].data, sets[1]])),
mtype="pd.DataFrame",
return_metadata=True,
var_name="the dataset",
@@ -797,7 +822,7 @@ def _has_data_sets(
raise ValueError(msg)
else:
# Else check for duplicate indices
- if bk.concat([sets[0].data, sets[1]]).index.duplicated().any():
+ if pd.concat([sets[0].data, sets[1]]).index.duplicated().any():
raise ValueError(
"Duplicate indices found in the dataset. "
"Try initializing atom using `index=False`."
@@ -1106,7 +1131,7 @@ def export_pipeline(self, model: str | Model | None = None) -> Pipeline:
def get_class_weight(
self,
rows: RowSelector = "train",
- ) -> dict[Hashable, float] | dict[str, dict[Hashable, float]]:
+ ) -> dict[Hashable, float] | dict[Hashable, dict[Hashable, float]]:
"""Return class weights for a balanced data set.
Statistically, the class weights re-balance the data set so
@@ -1128,12 +1153,12 @@ def get_class_weight(
"""
- def get_weights(col: Series) -> dict[Hashable, float]:
+ def get_weights(col: pd.Series) -> dict[Hashable, float]:
"""Get the class weights for one column.
Parameters
----------
- col: series
+ col: pd.Series
Column to get the weights from.
Returns
@@ -1147,14 +1172,14 @@ def get_weights(col: Series) -> dict[Hashable, float]:
_, y = self.branch._get_rows(rows, return_X_y=True)
- if self.task.is_multioutput:
- return {str(col.name): get_weights(col) for col in get_cols(y)}
- else:
+ if isinstance(y, pd.Series):
return get_weights(y)
+ else:
+ return {col.name: get_weights(col) for col in get_cols(y)}
@available_if(has_task("classification"))
@composed(crash, beartype)
- def get_sample_weight(self, rows: RowSelector = "train") -> Series:
+ def get_sample_weight(self, rows: RowSelector = "train") -> pd.Series:
"""Return sample weights for a balanced data set.
The returned weights are inversely proportional to the class
@@ -1169,13 +1194,13 @@ def get_sample_weight(self, rows: RowSelector = "train") -> Series:
Returns
-------
- series
+ pd.Series
Sequence of weights with shape=(n_samples,).
"""
_, y = self.branch._get_rows(rows, return_X_y=True)
weights = compute_sample_weight("balanced", y=y)
- return bk.Series(weights, name="sample_weight").round(3)
+ return pd.Series(weights, name="sample_weight").round(3)
@available_if(has_task("forecast"))
@composed(crash, beartype)
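Both helpers wrap sklearn's "balanced" heuristic, weight(c) = n_samples / (n_classes * count(c)); a hedged usage sketch:

```python
# For an imbalanced binary training set of 100 rows split 88/12:
cw = atom.get_class_weight("train")   # e.g. {0: 0.568, 1: 4.167}
sw = atom.get_sample_weight("train")  # pd.Series of per-row weights, rounded
                                      # to 3 decimals as in the code above
```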
diff --git a/atom/basetrainer.py b/atom/basetrainer.py
index b88819538..3a74b1c49 100644
--- a/atom/basetrainer.py
+++ b/atom/basetrainer.py
@@ -12,15 +12,13 @@
from datetime import datetime as dt
from typing import Any
-import joblib
import mlflow
import numpy as np
-import ray
from joblib import Parallel, delayed
from optuna import Study, create_study
from atom.baserunner import BaseRunner
-from atom.branch import BranchManager
+from atom.data import BranchManager
from atom.data_cleaning import BaseTransformer
from atom.models import MODELS, CustomModel
from atom.plots import RunnerPlot
@@ -70,7 +68,7 @@ def __init__(
self._models = lst(models) if models is not None else ClassMap()
self._metric = lst(metric) if metric is not None else ClassMap()
- self._config = DataConfig()
+ self._config = DataConfig(index=self._goal is Goal.forecast)
self._branches = BranchManager(memory=self.memory)
self._n_trials = {}
@@ -374,14 +372,20 @@ def execute_model(m: Model) -> Model | None:
m.verbose = self.verbose
if self.backend == "ray":
+ import ray
+
# This implementation is more efficient than through joblib's
# ray backend. The difference is that in this one you start N
# tasks, and in the other, you start N actors and then have them
# each run the function
execute_remote = ray.remote(num_cpus=self.n_jobs)(execute_model)
models = ray.get([execute_remote.remote(m) for m in self._models])
+ elif self.backend == "dask":
+ import dask
+
+ models = dask.compute(*[dask.delayed(execute_model)(m) for m in self._models])
else:
- models = Parallel(n_jobs=self.n_jobs, backend=self.backend)(
+ models = Parallel(n_jobs=self.n_jobs)(
delayed(execute_model)(m) for m in self._models
)
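The dask branch above builds one lazy task per model and resolves them all in a single scheduler pass; a minimal standalone sketch of that pattern:

```python
import dask

def execute_model(name: str) -> str:
    # Stand-in for the real execute_model: fit one model, return the result.
    return f"fitted-{name}"

# Build lazy tasks first, then execute them together, mirroring
# dask.compute(*[dask.delayed(execute_model)(m) for m in self._models]).
tasks = [dask.delayed(execute_model)(m) for m in ("LR", "RF", "XGB")]
models = dask.compute(*tasks)  # ('fitted-LR', 'fitted-RF', 'fitted-XGB')
```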
@@ -391,8 +395,7 @@ def execute_model(m: Model) -> Model | None:
m.verbose = vb
else:
- with joblib.parallel_backend(backend=self.backend):
- models = [model for m in self._models if (model := execute_model(m))]
+ models = [model for m in self._models if (model := execute_model(m))]
self._models = ClassMap(m for m in models if m)
diff --git a/atom/basetransformer.py b/atom/basetransformer.py
index d5697754f..859d3f930 100644
--- a/atom/basetransformer.py
+++ b/atom/basetransformer.py
@@ -13,7 +13,6 @@
import tempfile
import warnings
from collections.abc import Hashable
-from copy import deepcopy
from datetime import datetime as dt
from importlib import import_module
from importlib.util import find_spec
@@ -22,26 +21,23 @@
from pathlib import Path
from typing import Literal, TypeVar, overload
-import dagshub
+import joblib
import mlflow
import numpy as np
-import ray
-import requests
+import pandas as pd
from beartype import beartype
-from dagshub.auth.token_auth import HTTPBearerAuth
from joblib.memory import Memory
from pandas._typing import Axes
-from ray.util.joblib import register_ray
from sklearn.utils.validation import check_memory
from atom.utils.types import (
- Backend, Bool, DataFrame, Engine, EngineDataOptions,
- EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int,
- IntLargerEqualZero, Pandas, Sequence, Severity, Verbose, Warnings,
- XSelector, YSelector, bool_t, dataframe_t, int_t, sequence_t,
+ Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions,
+ EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas,
+ Severity, Verbose, Warnings, XReturn, XSelector, YReturn, YSelector,
+ bool_t, int_t,
)
from atom.utils.utils import (
- crash, flt, lst, make_sklearn, n_cols, to_df, to_pandas,
+ check_dependency, crash, lst, make_sklearn, to_df, to_tabular,
)
@@ -136,29 +132,18 @@ def engine(self, value: Engine):
data=value.get("data", EngineTuple().data),
estimator=value.get("estimator", EngineTuple().estimator),
)
- else:
- engine = value # type: ignore[assignment]
-
- if engine.data == "modin" and not ray.is_initialized():
- ray.init(
- runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_Pandas__": "1"}},
- log_to_driver=False,
- )
+ elif isinstance(value, EngineTuple):
+ engine = value
- # Update env variable to use for PandasModin in utils.py
- os.environ["ATOM_DATA_ENGINE"] = engine.data
+ # Make sure the data engine library is installed
+ check_dependency(engine.data_engine.library)
if engine.estimator == "sklearnex":
- if not find_spec("sklearnex"):
- raise ModuleNotFoundError(
- "Failed to import scikit-learn-intelex. The library is "
- "not installed. Note that the library only supports CPUs "
- "with a x86 architecture."
- )
- else:
- import sklearnex
+ check_dependency("sklearnex")
+ import sklearnex
+
+ sklearnex.set_config(self.device.lower() if self._gpu else "auto")
- sklearnex.set_config(self.device.lower() if self._gpu else "auto")
elif engine.estimator == "cuml":
if not find_spec("cuml"):
raise ModuleNotFoundError(
@@ -186,10 +171,25 @@ def backend(self) -> Backend:
@beartype
def backend(self, value: Backend):
if value == "ray":
+ check_dependency("ray")
+ import ray
+ from ray.util.joblib import register_ray
+
register_ray() # Register ray as joblib backend
if not ray.is_initialized():
ray.init(log_to_driver=False)
+ elif value == "dask":
+ check_dependency("dask")
+ from dask.distributed import Client
+
+ try:
+ Client.current()
+ except ValueError:
+ Client(processes=False)
+
+ joblib.parallel_config(backend=value)
+
self._backend = value
@property
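A sketch of what the new dask branch of the setter amounts to: reuse a running client when one exists, spin up a local one otherwise, then point joblib at it.

```python
import joblib
from dask.distributed import Client  # importing distributed registers the backend

try:
    client = Client.current()  # raises ValueError if no client exists yet
except ValueError:
    client = Client(processes=False)  # in-process, thread-based workers

joblib.parallel_config(backend="dask")  # subsequent Parallel() calls use dask
```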
@@ -299,6 +299,12 @@ def experiment(self, value: str | None):
self._experiment = value
if value:
if value.lower().startswith("dagshub:"):
+ check_dependency("dagshub")
+ check_dependency("requests")
+ import dagshub
+ import requests
+ from dagshub.auth.token_auth import HTTPBearerAuth
+
value = value[8:] # Drop dagshub:
token = dagshub.auth.get_token()
@@ -359,99 +365,24 @@ def _device_id(self) -> int:
# Methods ====================================================== >>
- def _inherit(
- self,
- obj: T_Estimator, fixed: tuple[str, ...] = (),
- feature_names_out: FeatureNamesOut = "one-to-one",
- ) -> T_Estimator:
- """Inherit parameters from parent.
-
- Utility method to set the sp (seasonal period), n_jobs and
- random_state parameters of an estimator (if available) equal
- to that of this instance. If `obj` is a meta-estimator, it
- also adjusts the parameters of the base estimator.
-
- Parameters
- ----------
- obj: Estimator
- Instance for which to change the parameters.
-
- fixed: tuple of str, default=()
- Fixed parameters that should not be overriden.
-
- feature_names_out: "one-to-one", callable or None, default="one-to-one"
- Determines the list of feature names that will be returned
- by the `get_feature_names_out` method.
-
- - If None: The `get_feature_names_out` method is not defined.
- - If "one-to-one": The output feature names will be equal to
- the input feature names.
- - If callable: Function that takes positional arguments self
- and a sequence of input feature names. It must return a
- sequence of output feature names.
-
- Returns
- -------
- Estimator
- Same object with changed parameters.
-
- """
- for p in obj.get_params():
- if p in fixed:
- continue
- elif match := re.search("^(n_jobs|random_state)$|__\1$", p):
- obj.set_params(**{p: getattr(self, match.group())})
- elif re.search(r"^sp$|__sp$", p) and hasattr(self, "_config") and self._config.sp:
- if self.multiple_seasonality:
- obj.set_params(**{p: self._config.sp.sp})
- else:
- obj.set_params(**{p: lst(self._config.sp.sp)[0]})
-
- return make_sklearn(obj, feature_names_out=feature_names_out)
-
- def _get_est_class(self, name: str, module: str) -> type[Estimator]:
- """Import a class from a module.
-
- When the import fails, for example, if atom uses sklearnex and
- that's passed to a transformer, use sklearn's (default engine).
-
- Parameters
- ----------
- name: str
- Name of the class to get.
-
- module: str
- Module from which to get the class.
-
- Returns
- -------
- Estimator
- Class of the estimator.
-
- """
- try:
- mod = import_module(f"{self.engine.estimator}.{module}")
- except (ModuleNotFoundError, AttributeError):
- mod = import_module(f"sklearn.{module}")
-
- return make_sklearn(getattr(mod, name))
-
@staticmethod
@overload
def _check_input(
X: XSelector,
y: Literal[None],
- columns: Axes,
- name: Literal[None],
- ) -> tuple[DataFrame, None]: ...
+ *,
+ columns: Axes | None = ...,
+ name: str | Axes | None = ...,
+ ) -> tuple[pd.DataFrame, None]: ...
@staticmethod
@overload
def _check_input(
X: Literal[None],
y: YSelector,
- columns: Literal[None],
- name: str | Sequence[str],
+ *,
+ columns: Axes | None = ...,
+ name: str | Axes | None = ...,
) -> tuple[None, Pandas]: ...
@staticmethod
@@ -459,134 +390,72 @@ def _check_input(
def _check_input(
X: XSelector,
y: YSelector,
+ *,
columns: Axes | None = ...,
- name: str | Sequence[str] | None = ...,
- ) -> tuple[DataFrame, Pandas]: ...
+ name: str | Axes | None = ...,
+ ) -> tuple[pd.DataFrame, Pandas]: ...
@staticmethod
def _check_input(
X: XSelector | None = None,
y: YSelector | None = None,
+ *,
columns: Axes | None = None,
- name: str | Sequence[str] | None = None,
- ) -> tuple[DataFrame | None, Pandas | None]:
+ name: str | Axes | None = None,
+ ) -> tuple[pd.DataFrame | None, Pandas | None]:
"""Prepare the input data.
- Convert X and y to pandas (if not already) and perform standard
- compatibility checks (dimensions, length, indices, etc...).
+ Convert X and y to pandas and perform standard compatibility
+ checks (dimensions, length, indices, etc.).
Parameters
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
+ `X` is ignored.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: int, str, sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
+ - If None: `y` is ignored.
+ - If int: Position of the target column in `X`.
+ - If str: Name of the target column in `X`.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
- - If dataframe: Target columns for multioutput tasks.
+ - If dataframe-like: Target columns for multioutput tasks.
- columns: sequence or None, default=None
- Names of the features corresponding to `X`. If X already is a
- dataframe, force feature order. If None and X is not a
- dataframe, assign default feature names.
+ columns: sequence of str or None, default=None
+ Column names for the feature set. If None, default names
+ are used.
name: str, sequence or None, default=None
- Name of the target column(s) corresponding to y. If None and
- y is not a pandas object, assign default target name.
+ Name of the target column(s). If None, a default name is
+ used.
Returns
-------
- dataframe or None
- Feature dataset. Only returned if provided.
+ pd.DataFrame or None
+ Feature set.
- series, dataframe or None
- Target column corresponding to `X`.
+ pd.Series, pd.DataFrame or None
+ Target column(s) corresponding to `X`.
"""
- Xt: DataFrame | None = None
- yt: Pandas | None = None
-
if X is None and y is None:
raise ValueError("X and y can't be both None!")
- elif X is not None:
- Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns)
-
- # If text dataset, change the name of the column to corpus
- if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object":
- Xt = Xt.rename(columns={Xt.columns[0]: "corpus"})
- else:
- # Convert all column names to str
- Xt.columns = Xt.columns.astype(str)
-
- # No duplicate rows nor column names are allowed
- if Xt.columns.duplicated().any():
- raise ValueError("Duplicate column names found in X.")
-
- # Reorder columns to original order
- if columns is not None:
- try:
- Xt = Xt[list(columns)] # Force order determined by columns
- except KeyError:
- raise ValueError(
- f"The features are different than seen at fit time. "
- f"Features {set(Xt.columns) - set(columns)} are missing in X."
- ) from None
+ else:
+ Xt = to_df(X() if callable(X) else X, columns=columns)
# Prepare target column
- if isinstance(y, (dict, *sequence_t, *dataframe_t)):
- if isinstance(y, dict):
- yt = to_df(deepcopy(y), index=getattr(Xt, "index", None))
- if n_cols(yt) == 1:
- yt = yt.iloc[:, 0] # If y is one-dimensional, get series
-
- else:
- # If X and y have different number of rows, try multioutput
- if Xt is not None and len(Xt) != len(y):
- try:
- targets: list[Hashable] = []
- for col in y:
- if col in Xt.columns:
- targets.append(col)
- elif isinstance(col, int_t):
- if -Xt.shape[1] <= col < Xt.shape[1]:
- targets.append(Xt.columns[int(col)])
- else:
- raise IndexError(
- "Invalid value for the y parameter. Value "
- f"{col} is out of range for data with "
- f"{Xt.shape[1]} columns."
- )
-
- Xt, yt = Xt.drop(columns=targets), Xt[targets]
-
- except (TypeError, IndexError, KeyError):
- raise ValueError(
- "X and y don't have the same number of rows,"
- f" got len(X)={len(Xt)} and len(y)={len(y)}."
- ) from None
- else:
- yt = y
-
- default_cols = [f"y{i}" for i in range(n_cols(y))]
- yt = to_pandas(
- data=deepcopy(yt),
- index=getattr(Xt, "index", None),
- name=flt(name) if name is not None else "target",
- columns=name if isinstance(name, sequence_t) else default_cols,
- )
-
- # Check X and y have the same indices
- if Xt is not None and not Xt.index.equals(yt.index):
- raise ValueError("X and y don't have the same indices!")
+ yt: Pandas | None
+ if y is None:
+ yt = None
+ elif isinstance(y, int_t):
+ if Xt is None:
+ raise ValueError("X can't be None when y is an int.")
+ Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]]
elif isinstance(y, str):
if Xt is not None:
if y not in Xt.columns:
@@ -596,15 +465,148 @@ def _check_input(
else:
raise ValueError("X can't be None when y is a string.")
+ else:
+ # If X and y have different number of rows, try multioutput
+ if Xt is not None and not isinstance(y, dict) and len(Xt) != len(y):
+ try:
+ targets: list[Hashable] = []
+ for col in y:
+ if isinstance(col, str) and col in Xt.columns:
+ targets.append(col)
+ elif isinstance(col, int_t):
+ if -Xt.shape[1] <= col < Xt.shape[1]:
+ targets.append(Xt.columns[int(col)])
+ else:
+ raise IndexError(
+ "Invalid value for the y parameter. Value "
+ f"{col} is out of range for data with "
+ f"{Xt.shape[1]} columns."
+ )
+
+ Xt, yt = Xt.drop(columns=targets), Xt[targets]
+
+ except (TypeError, IndexError, KeyError):
+ raise ValueError(
+ "X and y don't have the same number of rows,"
+ f" got len(X)={len(Xt)} and len(y)={len(y)}."
+ ) from None
+ else:
+ yt = to_tabular(y, index=getattr(Xt, "index", None), columns=name)
- elif isinstance(y, int_t):
- if Xt is None:
- raise ValueError("X can't be None when y is an int.")
-
- Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]]
+ # Check X and y have the same indices
+ if Xt is not None and not Xt.index.equals(yt.index):
+ raise ValueError("X and y don't have the same indices!")
return Xt, yt
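When `y` is a sequence whose length differs from `len(X)`, the branch above interprets it as target column names/positions and splits them off the feature set; an illustrative sketch (not atom's public API):

```python
import pandas as pd

X = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "t1": [0, 1], "t2": [1, 0]})
y = ["t1", -1]  # mix of a column name and a (negative) position

# len(y) != len(X), so y selects target columns out of X:
cols = [c if isinstance(c, str) else X.columns[int(c)] for c in y]
Xt, yt = X.drop(columns=cols), X[cols]  # Xt: f1, f2 / yt: t1, t2
```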
+ @overload
+ def _convert(self, obj: Literal[None]) -> None: ...
+
+ @overload
+ def _convert(self, obj: pd.DataFrame) -> XReturn: ...
+
+ @overload
+ def _convert(self, obj: pd.Series) -> YReturn: ...
+
+ def _convert(self, obj: Pandas | None) -> YReturn | None:
+ """Convert data to the type set in the data engine.
+
+ Non-pandas types are returned as is.
+
+ Parameters
+ ----------
+ obj: object
+ Object to convert.
+
+ Returns
+ -------
+ object
+ Converted data or unchanged object.
+
+ """
+ # Only apply transformations when the engine is defined
+ if hasattr(self, "_engine") and isinstance(obj, pd.Series | pd.DataFrame):
+ return self._engine.data_engine.convert(obj)
+ else:
+ return obj
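The effect, sketched from the user's side (atom keeps pandas internally and converts on the way out; the string shorthand for the data engine is assumed here):

```python
atom = ATOMClassifier(X, y, engine="polars", verbose=0)
type(atom.dataset)  # polars.DataFrame
type(atom.y)        # polars.Series
```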
+
+ def _get_est_class(self, name: str, module: str) -> type[Estimator]:
+ """Import a class from a module.
+
+ When the import fails, for example, if atom uses sklearnex and
+ that's passed to a transformer, use sklearn's (default engine).
+
+ Parameters
+ ----------
+ name: str
+ Name of the class to get.
+
+ module: str
+ Module from which to get the class.
+
+ Returns
+ -------
+ Estimator
+ Class of the estimator.
+
+ """
+ try:
+ mod = import_module(f"{self.engine.estimator}.{module}")
+ except (ModuleNotFoundError, AttributeError):
+ mod = import_module(f"sklearn.{module}")
+
+ return make_sklearn(getattr(mod, name))
+
+ def _inherit(
+ self,
+ obj: T_Estimator, fixed: tuple[str, ...] = (),
+ feature_names_out: FeatureNamesOut = "one-to-one",
+ ) -> T_Estimator:
+ """Inherit parameters from parent.
+
+ Utility method to set the sp (seasonal period), n_jobs and
+ random_state parameters of an estimator (if available) equal
+ to that of this instance. If `obj` is a meta-estimator, it
+ also adjusts the parameters of the base estimator.
+
+ Parameters
+ ----------
+ obj: Estimator
+ Instance for which to change the parameters.
+
+ fixed: tuple of str, default=()
+ Fixed parameters that should not be overridden.
+
+ feature_names_out: "one-to-one", callable or None, default="one-to-one"
+ Determines the list of feature names that will be returned
+ by the `get_feature_names_out` method.
+
+ - If None: The `get_feature_names_out` method is not defined.
+ - If "one-to-one": The output feature names will be equal to
+ the input feature names.
+ - If callable: Function that takes positional arguments self
+ and a sequence of input feature names. It must return a
+ sequence of output feature names.
+
+ Returns
+ -------
+ Estimator
+ Same object with changed parameters.
+
+ """
+ for p in obj.get_params():
+ if p in fixed:
+ continue
+ elif match := re.search(r"(?:^|__)(n_jobs|random_state)$", p):
+ obj.set_params(**{p: getattr(self, match.group(1))})
+ elif re.search(r"^sp$|__sp$", p) and hasattr(self, "_config") and self._config.sp:
+ if self.multiple_seasonality:
+ obj.set_params(**{p: self._config.sp.sp})
+ else:
+ obj.set_params(**{p: lst(self._config.sp.sp)[0]})
+
+ return make_sklearn(obj, feature_names_out=feature_names_out)
+
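A quick check of the parameter-matching regex used in `_inherit` (as fixed above), showing that it catches both top-level and nested estimator parameters:

```python
import re

PATTERN = r"(?:^|__)(n_jobs|random_state)$"
for p in ("n_jobs", "estimator__random_state", "n_jobs_first"):
    if match := re.search(PATTERN, p):
        print(p, "->", match.group(1))
# n_jobs -> n_jobs
# estimator__random_state -> random_state
# ("n_jobs_first" doesn't match: the name must *end* in n_jobs/random_state)
```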
@crash
def _log(self, msg: str, level: Int = 0, severity: Severity = "info"):
"""Print message and save to log file.
diff --git a/atom/branch/__init__.py b/atom/branch/__init__.py
deleted file mode 100644
index dd6f3adc1..000000000
--- a/atom/branch/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""Automated Tool for Optimized Modeling (ATOM).
-
-Author: Mavs
-Description: Module for branches.
-
-"""
-
-from atom.branch.branch import Branch
-from atom.branch.branchmanager import BranchManager
diff --git a/atom/data/__init__.py b/atom/data/__init__.py
new file mode 100644
index 000000000..236e72416
--- /dev/null
+++ b/atom/data/__init__.py
@@ -0,0 +1,10 @@
+"""Automated Tool for Optimized Modeling (ATOM).
+
+Author: Mavs
+Description: Module for branches.
+
+"""
+
+from atom.data.branch import Branch
+from atom.data.branchmanager import BranchManager
+from atom.data.dataengines import DATA_ENGINES
diff --git a/atom/branch/branch.py b/atom/data/branch.py
similarity index 84%
rename from atom/branch/branch.py
rename to atom/data/branch.py
index 8481386a8..d2f1f20b4 100644
--- a/atom/branch/branch.py
+++ b/atom/data/branch.py
@@ -15,6 +15,7 @@
from warnings import filterwarnings
import dill as pickle
+import pandas as pd
from beartype import beartype
from beartype.roar import BeartypeDecorHintPep585DeprecationWarning
from joblib.memory import Memory
@@ -22,12 +23,13 @@
from atom.pipeline import Pipeline
from atom.utils.types import (
- Bool, ColumnSelector, DataFrame, Index, Int, IntLargerEqualZero, Pandas,
- RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, XSelector,
- YSelector, dataframe_t, index_t, int_t, segment_t, series_t,
+ Bool, ColumnSelector, Int, IntLargerEqualZero, Pandas, RowSelector, Scalar,
+ TargetSelector, TargetsSelector, XConstructor, XDatasets, YConstructor,
+ YDatasets, int_t, segment_t,
)
from atom.utils.utils import (
- DataContainer, bk, flt, get_cols, lst, merge, to_pandas,
+ DataContainer, check_scaling, flt, get_col_names, get_cols, lst, merge,
+ to_tabular,
)
@@ -58,16 +60,16 @@ class Branch:
name: str
Name of the branch.
- memory: str, [Memory][joblibmemory] or None, default=None
- Memory object for pipeline caching and to store the data when
- the branch is inactive.
-
data: DataContainer or None, default=None
Data for the branch.
- holdout: dataframe or None, default=None
+ holdout: pd.DataFrame or None, default=None
Holdout data set.
+ memory: str, [Memory][joblibmemory] or None, default=None
+ Memory object for pipeline caching and to store the data when
+ the branch is inactive.
+
See Also
--------
atom.branch:BranchManager
@@ -98,12 +100,33 @@ class Branch:
"""
+ _shared_attrs = (
+ "pipeline",
+ "mapping",
+ "dataset",
+ "train",
+ "test",
+ "X",
+ "y",
+ "X_train",
+ "y_train",
+ "X_test",
+ "y_test",
+ "shape",
+ "columns",
+ "n_columns",
+ "features",
+ "n_features",
+ "target",
+ )
+
def __init__(
self,
name: str,
- memory: str | Memory | None = None,
data: DataContainer | None = None,
- holdout: DataFrame | None = None,
+ holdout: pd.DataFrame | None = None,
+ *,
+ memory: str | Memory | None = None,
):
self.name = name
self.memory = check_memory(memory)
@@ -161,14 +184,16 @@ def name(self, value: str):
# Data properties ============================================== >>
- def _check_setter(
- self,
- name: str,
- value: Sequence[Scalar | str] | XSelector,
- ) -> Pandas:
+ @overload
+ def _check_setter(self, name: XDatasets, value: XConstructor) -> pd.DataFrame: ...
+
+ @overload
+ def _check_setter(self, name: YDatasets, value: YConstructor) -> pd.Series: ...
+
+ def _check_setter(self, name: XDatasets | YDatasets, value: YConstructor) -> Pandas:
"""Check the data set's setter property.
- Convert the property to a pandas object and compare with the
+ Convert the property to a 'pandas' object and compare with the
rest of the dataset, to check if it has the right indices and
dimensions.
@@ -182,7 +207,7 @@ def _check_setter(
Returns
-------
- series or dataframe
+ pd.Series or pd.DataFrame
Data set.
"""
@@ -226,11 +251,13 @@ def counter(name: str, dim: str) -> str | None:
if under_name := counter(name, "under"):
under = getattr(self, under_name)
- obj = to_pandas(
+ if (columns := get_col_names(value)) is None:
+ columns = get_col_names(under) if under_name else None
+
+ obj = to_tabular(
data=value,
index=side.index if side_name else None,
- name=getattr(under, "name", "target") if under_name else "target",
- columns=getattr(under, "columns", None) if under_name else None,
+ columns=columns,
)
if side_name: # Check for equal rows
@@ -246,7 +273,7 @@ def counter(name: str, dim: str) -> str | None:
)
if under_name: # Check for equal columns
- if isinstance(obj, series_t):
+ if isinstance(obj, pd.Series):
if obj.name != under.name:
raise ValueError(
f"{name} and {under_name} must have the "
@@ -292,38 +319,38 @@ def mapping(self) -> dict[str, dict[Hashable, Scalar]]:
return self._mapping
@property
- def dataset(self) -> DataFrame:
+ def dataset(self) -> pd.DataFrame:
"""Complete data set."""
return self._data.data
@dataset.setter
- def dataset(self, value: XSelector):
+ def dataset(self, value: XConstructor):
self._data.data = self._check_setter("dataset", value)
@property
- def train(self) -> DataFrame:
+ def train(self) -> pd.DataFrame:
"""Training set."""
return self._data.data.loc[self._data.train_idx]
@train.setter
- def train(self, value: XSelector):
+ def train(self, value: XConstructor):
df = self._check_setter("train", value)
- self._data.data = bk.concat([df, self.test])
+ self._data.data = pd.concat([df, self.test])
self._data.train_idx = df.index
@property
- def test(self) -> DataFrame:
+ def test(self) -> pd.DataFrame:
"""Test set."""
return self._data.data.loc[self._data.test_idx]
@test.setter
- def test(self, value: XSelector):
+ def test(self, value: XConstructor):
df = self._check_setter("test", value)
- self._data.data = bk.concat([self.train, df])
+ self._data.data = pd.concat([self.train, df])
self._data.test_idx = df.index
@cached_property
- def holdout(self) -> DataFrame | None:
+ def holdout(self) -> pd.DataFrame | None:
"""Holdout set."""
if self._holdout is not None:
return merge(
@@ -336,12 +363,12 @@ def holdout(self) -> DataFrame | None:
return None
@property
- def X(self) -> DataFrame:
+ def X(self) -> pd.DataFrame:
"""Feature set."""
return self._data.data[self.features]
@X.setter
- def X(self, value: XSelector):
+ def X(self, value: XConstructor):
df = self._check_setter("X", value)
self._data.data = merge(df, self.y)
@@ -351,19 +378,19 @@ def y(self) -> Pandas:
return self._data.data[self.target]
@y.setter
- def y(self, value: YSelector):
+ def y(self, value: YConstructor):
series = self._check_setter("y", value)
self._data.data = merge(self.X, series)
@property
- def X_train(self) -> DataFrame:
+ def X_train(self) -> pd.DataFrame:
"""Features of the training set."""
return self.train[self.features]
@X_train.setter
- def X_train(self, value: XSelector):
+ def X_train(self, value: XConstructor):
df = self._check_setter("X_train", value)
- self._data.data = bk.concat([merge(df, self.y_train), self.test])
+ self._data.data = pd.concat([merge(df, self.y_train), self.test])
@property
def y_train(self) -> Pandas:
@@ -371,19 +398,19 @@ def y_train(self) -> Pandas:
return self.train[self.target]
@y_train.setter
- def y_train(self, value: YSelector):
+ def y_train(self, value: YConstructor):
series = self._check_setter("y_train", value)
- self._data.data = bk.concat([merge(self.X_train, series), self.test])
+ self._data.data = pd.concat([merge(self.X_train, series), self.test])
@property
- def X_test(self) -> DataFrame:
+ def X_test(self) -> pd.DataFrame:
"""Features of the test set."""
return self.test[self.features]
@X_test.setter
- def X_test(self, value: XSelector):
+ def X_test(self, value: XConstructor):
df = self._check_setter("X_test", value)
- self._data.data = bk.concat([self.train, merge(df, self.y_test)])
+ self._data.data = pd.concat([self.train, merge(df, self.y_test)])
@property
def y_test(self) -> Pandas:
@@ -391,9 +418,9 @@ def y_test(self) -> Pandas:
return self.test[self.target]
@y_test.setter
- def y_test(self, value: YSelector):
+ def y_test(self, value: YConstructor):
series = self._check_setter("y_test", value)
- self._data.data = bk.concat([self.train, merge(self.X_test, series)])
+ self._data.data = pd.concat([self.train, merge(self.X_test, series)])
@property
def shape(self) -> tuple[Int, Int]:
@@ -401,49 +428,61 @@ def shape(self) -> tuple[Int, Int]:
return self.dataset.shape
@property
- def columns(self) -> Index:
+ def columns(self) -> list[str]:
"""Name of all the columns."""
- return self.dataset.columns
+ return list(self.dataset.columns)
@property
- def n_columns(self) -> Int:
+ def n_columns(self) -> int:
"""Number of columns."""
return len(self.columns)
@property
- def features(self) -> Index:
+ def features(self) -> list[str]:
"""Name of the features."""
- return self.columns[:-self._data.n_cols]
+ return list(self.columns[:-self._data.n_targets])
@property
- def n_features(self) -> Int:
+ def n_features(self) -> int:
"""Number of features."""
return len(self.features)
@property
def target(self) -> str | list[str]:
"""Name of the target column(s)."""
- return flt(list(self.columns[-self._data.n_cols:]))
+ return flt(list(self.columns[-self._data.n_targets:]))
@property
- def _all(self) -> DataFrame:
+ def _all(self) -> pd.DataFrame:
"""Dataset + holdout.
Note that calling this property triggers the holdout set
calculation.
"""
- return bk.concat([self.dataset, self.holdout])
+ return pd.concat([self.dataset, self.holdout])
# Utility methods ============================================== >>
+ def _get_shared_attrs(self) -> list[str]:
+ """Get the attributes that can be accessed from a runner.
+
+ Returns
+ -------
+ list of str
+ Instance attributes.
+
+ """
+ instance_vars = [x for x in vars(self) if not x.startswith("_") and x.endswith("_")]
+ return list(self._shared_attrs) + instance_vars
+
@overload
def _get_rows(
self,
rows: RowSelector,
*,
return_X_y: Literal[False] = ...,
- ) -> DataFrame: ...
+ ) -> pd.DataFrame: ...
@overload
def _get_rows(
@@ -451,14 +490,14 @@ def _get_rows(
rows: RowSelector,
*,
return_X_y: Literal[True],
- ) -> tuple[DataFrame, Pandas]: ...
+ ) -> tuple[pd.DataFrame, Pandas]: ...
def _get_rows(
self,
rows: RowSelector,
*,
return_X_y: Bool = False,
- ) -> DataFrame | tuple[DataFrame, Pandas]:
+ ) -> pd.DataFrame | tuple[pd.DataFrame, Pandas]:
"""Get a subset of the rows.
Rows can be selected by name, index, data set or regex pattern.
@@ -479,10 +518,10 @@ def _get_rows(
Returns
-------
- dataframe
+ pd.DataFrame
Subset of rows.
- series or dataframe
+ pd.Series or pd.DataFrame
Subset of target column. Only returned if return_X_y=True.
"""
@@ -490,9 +529,9 @@ def _get_rows(
inc: list[Hashable] = []
exc: list[Hashable] = []
- if isinstance(rows, dataframe_t):
+ if isinstance(rows, pd.DataFrame):
inc.extend(rows.index)
- elif isinstance(rows, index_t):
+ elif isinstance(rows, pd.Index):
inc.extend(rows)
elif isinstance(rows, segment_t):
inc.extend(_all.index[rows])
@@ -541,10 +580,12 @@ def _get_rows(
# If rows were excluded with `!`, select all but those
inc = list(_all.index[~_all.index.isin(exc)])
+ rows_c = _all.loc[inc]
+
if return_X_y:
- return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index]
+ return rows_c[self.features], rows_c[self.target]
else:
- return self._all.loc[inc]
+ return rows_c
def _get_columns(
self,
@@ -590,7 +631,7 @@ def _get_columns(
return list(df.select_dtypes(include=["number"]).columns)
else:
return list(df.columns)
- elif isinstance(columns, dataframe_t):
+ elif isinstance(columns, pd.DataFrame):
inc.extend(list(columns.columns))
elif isinstance(columns, segment_t):
inc.extend(list(df.columns[columns]))
@@ -755,7 +796,7 @@ def get_class(
if only_columns and not isinstance(target, tuple):
return get_column(target)
elif isinstance(target, tuple):
- if not isinstance(self.y, dataframe_t):
+ if not isinstance(self.y, pd.DataFrame):
raise ValueError(
f"Invalid value for the target parameter, got {target}. "
"A tuple is only accepted for multioutput tasks."
@@ -831,3 +872,27 @@ def store(self, *, assign: Bool = True):
if assign:
self._container = None
+
+ def check_scaling(self) -> bool:
+ """Whether the feature set is scaled.
+
+ A data set is considered scaled when it has mean~0 and std~1,
+ or when there is a scaler in the pipeline. Categorical and
+ binary columns (only zeros and ones) are excluded from the
+ calculation.
+
+ Returns
+ -------
+ bool
+ Whether the feature set is scaled.
+
+ """
+ if any("scaler" in name.lower() for name in self.pipeline.named_steps):
+ return True
+
+ df = self.X.loc[:, (~self.X.isin([0, 1])).any(axis=0)] # Remove binary columns
+
+ if df.empty: # All columns are binary -> considered scaled
+ return True
+ else:
+ return check_scaling(df)
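A rough stand-in for the `check_scaling` utility this method delegates to, under the mean~0/std~1 heuristic stated in the docstring (tolerances are illustrative):

```python
import numpy as np
import pandas as pd

def looks_scaled(df: pd.DataFrame, atol: float = 0.25) -> bool:
    # Treat a feature set as scaled when its values have mean~0 and std~1.
    values = df.to_numpy()
    return bool(
        np.isclose(values.mean(), 0, atol=atol)
        and np.isclose(values.std(), 1, atol=atol)
    )

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 3)))
print(looks_scaled(X))        # True
print(looks_scaled(X * 100))  # False (std ~ 100)
```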
diff --git a/atom/branch/branchmanager.py b/atom/data/branchmanager.py
similarity index 94%
rename from atom/branch/branchmanager.py
rename to atom/data/branchmanager.py
index 0d2a36f7d..66f7ed3de 100644
--- a/atom/branch/branchmanager.py
+++ b/atom/data/branchmanager.py
@@ -11,12 +11,13 @@
from collections.abc import Iterator
from copy import copy, deepcopy
+import pandas as pd
from beartype import beartype
from joblib.memory import Memory
from sklearn.utils.validation import check_memory
-from atom.branch.branch import Branch
-from atom.utils.types import Bool, DataFrame, Int
+from atom.data.branch import Branch
+from atom.utils.types import Bool, Int
from atom.utils.utils import ClassMap, DataContainer
@@ -99,7 +100,7 @@ def __repr__(self) -> str:
"""Print containing branches."""
return f"BranchManager([{', '.join(self.branches.keys())}], og={self.og.name})"
- def __len__(self) -> Int:
+ def __len__(self) -> int:
"""Get the number of branches in the manager."""
return len(self.branches)
@@ -212,9 +213,11 @@ def add(self, name: str, parent: Branch | None = None):
if parent:
self._copy_from_parent(self.current, parent)
- def fill(self, data: DataContainer, holdout: DataFrame | None = None):
+ def fill(self, data: DataContainer, holdout: pd.DataFrame | None = None):
"""Fill the current branch with data.
+ This call resets the cached holdout calculation.
+
Parameters
----------
data: DataContainer
@@ -225,7 +228,10 @@ def fill(self, data: DataContainer, holdout: DataFrame | None = None):
"""
self.current._container = data
- self.current._holdout = holdout
+ if holdout is not None:
+ self.current._holdout = holdout
+
+ self.current.__dict__.pop("holdout", None)
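The `__dict__.pop` above works because `functools.cached_property` stores its result in the instance dict; a generic demonstration:

```python
from functools import cached_property

class Demo:
    @cached_property
    def holdout(self):
        print("computing...")
        return 42

d = Demo()
d.holdout                        # prints "computing...", returns 42
d.holdout                        # cached: returns 42 silently
d.__dict__.pop("holdout", None)  # what fill() does to reset the cache
d.holdout                        # prints "computing..." again
```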
def reset(self, *, hard: Bool = False):
"""Reset this instance to its initial state.
diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py
new file mode 100644
index 000000000..7d5d4500c
--- /dev/null
+++ b/atom/data/dataengines.py
@@ -0,0 +1,206 @@
+"""Automated Tool for Optimized Modeling (ATOM).
+
+Author: Mavs
+Description: Module containing the data engines.
+
+"""
+
+from __future__ import annotations
+
+from abc import ABCMeta, abstractmethod
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from atom.utils.types import Any, Pandas
+
+
+if TYPE_CHECKING:
+ import dask.dataframe as dd
+ import modin.pandas as md
+ import polars as pl
+ import pyarrow as pa
+ import pyspark.pandas as ps
+ import pyspark.sql as psql
+
+
+class DataEngine(metaclass=ABCMeta):
+ """Abstract class for data engines.
+
+ Data engines convert the pandas objects used internally to the
+ output type of the selected engine.
+
+ """
+
+ @staticmethod
+ @abstractmethod
+ def convert(obj: Pandas) -> Any:
+ """Convert to data engine output types."""
+
+
+class NumpyEngine(DataEngine):
+ """Numpy data engine."""
+
+ library = "numpy"
+
+ @staticmethod
+ def convert(obj: Pandas) -> np.ndarray:
+ """Convert to numpy array."""
+ return obj.to_numpy()
+
+
+class PandasEngine(DataEngine):
+ """Pandas numpy data engine."""
+
+ library = "pandas"
+
+ @staticmethod
+ def convert(obj: Pandas) -> Pandas:
+ """Leave as is."""
+ return obj
+
+
+class PandasPyarrowEngine(DataEngine):
+ """Pandas pyarrow data engine."""
+
+ library = "pandas"
+
+ @staticmethod
+ def convert(obj: Pandas) -> Pandas:
+ """Convert to pyarrow dtypes."""
+ from pyarrow import from_numpy_dtype
+
+ if isinstance(obj, pd.DataFrame):
+ return obj.astype(
+ {
+ c: pd.ArrowDtype(from_numpy_dtype(getattr(d, "numpy_dtype", d)))
+ for c, d in obj.dtypes.items()
+ }
+ )
+ else:
+ return obj.astype(
+ pd.ArrowDtype(from_numpy_dtype(obj.dtype))
+ if isinstance(obj.dtype, np.dtype) else obj.dtype
+ )
+
+
+class PolarsEngine(DataEngine):
+ """Polars data engine."""
+
+ library = "polars"
+
+ @staticmethod
+ def convert(obj: Pandas) -> pl.Series | pl.DataFrame:
+ """Convert to polars objects."""
+ import polars as pl
+
+ if isinstance(obj, pd.DataFrame):
+ return pl.DataFrame(obj)
+ else:
+ return pl.Series(obj)
+
+
+class PolarsLazyEngine(DataEngine):
+ """Polars lazy data engine."""
+
+ library = "polars"
+
+ @staticmethod
+ def convert(obj: Pandas) -> pl.Series | pl.LazyFrame:
+ """Convert to lazy polars objects."""
+ import polars as pl
+
+ if isinstance(obj, pd.DataFrame):
+ return pl.LazyFrame(obj)
+ else:
+ return pl.Series(obj)
+
+
+class PyArrowEngine(DataEngine):
+ """PyArrow data engine."""
+
+ library = "pyarrow"
+
+ @staticmethod
+ def convert(obj: Pandas) -> pa.Array | pa.Table:
+ """Convert to pyarrow objects."""
+ import pyarrow as pa
+
+ if isinstance(obj, pd.DataFrame):
+ return pa.Table.from_pandas(obj)
+ else:
+ return pa.Array.from_pandas(obj)
+
+
+class ModinEngine(DataEngine):
+ """Modin data engine."""
+
+ library = "modin"
+
+ @staticmethod
+ def convert(obj: Pandas) -> md.Series | md.DataFrame:
+ """Convert to modin objects."""
+ import modin.pandas as md
+
+ if isinstance(obj, pd.DataFrame):
+ return md.DataFrame(obj)
+ else:
+ return md.Series(obj)
+
+
+class DaskEngine(DataEngine):
+ """Dask data engine."""
+
+ library = "dask"
+
+ @staticmethod
+ def convert(obj: Pandas) -> dd.Series | dd.DataFrame:
+ """Convert to dask objects."""
+ import dask.dataframe as dd
+
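+ # Roughly one partition per 1e6 rows, with a floor of one partition
+ # (e.g., 2_500_000 rows -> 2 partitions)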
+ return dd.from_pandas(obj, npartitions=int(max(1, len(obj) // 1e6)))
+
+
+class PySparkEngine(DataEngine):
+ """PySpark data engine."""
+
+ library = "pyspark"
+
+ @staticmethod
+ def convert(obj: Pandas) -> psql.DataFrame:
+ """Convert to pyspark objects."""
+ from pyspark.sql import SparkSession
+
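+ # getOrCreate reuses the active Spark session if one exists; otherwise,
+ # it starts a new session under the "atom-ml" application name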
+ spark = SparkSession.builder.appName("atom-ml").getOrCreate()
+ return spark.createDataFrame(obj)
+
+
+class PySparkPandasEngine(DataEngine):
+ """PySpark data engine with pandas API."""
+
+ library = "pyspark"
+
+ @staticmethod
+ def convert(obj: Pandas) -> ps.Series | ps.DataFrame:
+ """Convert to pyspark objects."""
+ import pyspark.pandas as ps
+
+ if isinstance(obj, pd.DataFrame):
+ return ps.DataFrame(obj)
+ else:
+ return ps.Series(obj)
+
+
+DATA_ENGINES = {
+ "numpy": NumpyEngine,
+ "pandas": PandasEngine,
+ "pandas-pyarrow": PandasPyarrowEngine,
+ "polars": PolarsEngine,
+ "polars-lazy": PolarsLazyEngine,
+ "pyarrow": PyArrowEngine,
+ "modin": ModinEngine,
+ "dask": DaskEngine,
+ "pyspark": PySparkEngine,
+ "pyspark-pandas": PySparkPandasEngine,
+}
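
A minimal usage sketch of the new module (illustrative; assumes polars is
installed): an engine is looked up in the DATA_ENGINES registry and applied
through its static convert method.

    import pandas as pd

    from atom.data.dataengines import DATA_ENGINES

    df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})

    engine = DATA_ENGINES["polars"]  # -> PolarsEngine
    out = engine.convert(df)         # polars.DataFrame
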
diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
index 165e31475..2861c0326 100644
--- a/atom/data_cleaning.py
+++ b/atom/data_cleaning.py
@@ -10,8 +10,7 @@
import re
from collections import defaultdict
from collections.abc import Hashable
-from typing import Any, Literal, TypeVar
-from unittest.mock import patch
+from typing import Any, Literal, TypeVar, overload
import numpy as np
import pandas as pd
@@ -40,33 +39,31 @@
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer # noqa: F401
from sklearn.impute import IterativeImputer, KNNImputer
-from sklearn.utils._set_output import _SetOutputMixin
from sklearn.utils.validation import _check_feature_names_in
from sktime.transformations.series.detrend import (
ConditionalDeseasonalizer, Deseasonalizer, Detrender,
)
-from sktime.transformations.series.impute import Imputer as sktimeImputer
+from sktime.transformations.series.impute import Imputer as SktimeImputer
from typing_extensions import Self
from atom.basetransformer import BaseTransformer
from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING
-from atom.utils.patches import wrap_method_output
from atom.utils.types import (
- Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine,
- EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero,
- IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats,
- Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels,
- Sequence, Series, Transformer, Verbose, XConstructor, YConstructor,
- dataframe_t, sequence_t, series_t,
+ Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine,
+ EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int,
+ IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats,
+ NumericalStrats, Predictor, PrunerStrats, Scalar, ScalerStrats,
+ SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, XReturn,
+ YConstructor, YReturn, sequence_t,
)
from atom.utils.utils import (
- Goal, bk, check_is_fitted, composed, crash, get_col_order, get_cols, it,
- lst, make_sklearn, merge, method_to_log, n_cols, replace_missing, sign,
- to_df, to_series, variable_return, wrap_transformer_methods,
+ Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst,
+ make_sklearn, merge, n_cols, replace_missing, sign, to_df, to_series,
+ to_tabular, variable_return,
)
-T = TypeVar("T", bound=Transformer)
+T_Transformer = TypeVar("T_Transformer", bound=Transformer)
@beartype
@@ -77,21 +74,12 @@ class TransformerMixin(BaseEstimator, BaseTransformer):
- Accounts for the transformation of y.
- Always adds a fit method.
- - Wraps the fit method with a data check.
- - Wraps transforming methods with fit and data check.
+ - Wraps the fit method with attribute setup and a data check.
+ - Wraps transforming methods with a data check.
- Maintains internal attributes when cloned.
"""
- def __init_subclass__(cls, **kwargs):
- """Wrap transformer methods to apply data and fit check."""
- for k in ("fit", "transform", "inverse_transform"):
- setattr(cls, k, wrap_transformer_methods(getattr(cls, k)))
-
- # Patch to avoid errors for transformers that allow passing only y
- with patch("sklearn.utils._set_output._wrap_method_output", wrap_method_output):
- super().__init_subclass__(**kwargs)
-
def __repr__(self, N_CHAR_MAX: Int = 700) -> str:
"""Drop named tuples if default parameters from string representation."""
out = super().__repr__(N_CHAR_MAX)
@@ -107,7 +95,7 @@ def __repr__(self, N_CHAR_MAX: Int = 700) -> str:
return out
- def __sklearn_clone__(self: T) -> T:
+ def __sklearn_clone__(self: T_Transformer) -> T_Transformer:
"""Wrap cloning method to attach internal attributes."""
cloned = _clone_parametrized(self)
@@ -117,7 +105,6 @@ def __sklearn_clone__(self: T) -> T:
return cloned
- @composed(crash, method_to_log)
def fit(
self,
X: XConstructor | None = None,
@@ -132,20 +119,11 @@ def fit(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
-
- y: int, str, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
+ `X` is ignored.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe-like: Target columns with shape=(n_samples,
- n_targets) for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`. If None, `y` is
+ ignored.
**fit_params
Additional keyword arguments for the fit method.
@@ -156,37 +134,56 @@ def fit(
Estimator instance.
"""
+ Xt = to_df(X)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
self._log(f"Fitting {self.__class__.__name__}...", 1)
return self
- @composed(crash, method_to_log)
+ @overload
+ def fit_transform(
+ self,
+ X: Literal[None],
+ y: YConstructor,
+ **fit_params,
+ ) -> YReturn: ...
+
+ @overload
+ def fit_transform(
+ self,
+ X: XConstructor,
+ y: Literal[None] = ...,
+ **fit_params,
+ ) -> XReturn: ...
+
+ @overload
+ def fit_transform(
+ self,
+ X: XConstructor,
+ y: YConstructor,
+ **fit_params,
+ ) -> tuple[XReturn, YReturn]: ...
+
def fit_transform(
self,
X: XConstructor | None = None,
y: YConstructor | None = None,
**fit_params,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Fit to data, then transform it.
Parameters
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
-
- y: int, str, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
+ `X` is ignored.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe-like: Target columns with shape=(n_samples,
- n_targets) for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`. If None, `y` is
+ ignored.
**fit_params
Additional keyword arguments for the fit method.
@@ -202,12 +199,36 @@ def fit_transform(
"""
return self.fit(X, y, **fit_params).transform(X, y)
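
With these overloads, a static type checker narrows the return type from the
call pattern. A sketch using the Cleaner class defined further below (data is
illustrative):

    import pandas as pd

    from atom.data_cleaning import Cleaner

    X = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    y = pd.Series([0, 1], name="target")

    cleaner = Cleaner()
    Xt = cleaner.fit_transform(X)         # inferred as XReturn
    yt = cleaner.fit_transform(None, y)   # inferred as YReturn
    Xt, yt = cleaner.fit_transform(X, y)  # tuple[XReturn, YReturn]
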
- @composed(crash, method_to_log)
+ @overload
+ def inverse_transform(
+ self,
+ X: Literal[None],
+ y: YConstructor,
+ **fit_params,
+ ) -> YReturn: ...
+
+ @overload
+ def inverse_transform(
+ self,
+ X: XConstructor,
+ y: Literal[None] = ...,
+ **fit_params,
+ ) -> XReturn: ...
+
+ @overload
+ def inverse_transform(
+ self,
+ X: XConstructor,
+ y: YConstructor,
+ **fit_params,
+ ) -> tuple[XReturn, YReturn]: ...
+
def inverse_transform(
self,
X: XConstructor | None = None,
y: YConstructor | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ **fit_params,
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Do nothing.
Returns the input unchanged. Implemented for continuity of the
@@ -217,20 +238,11 @@ def inverse_transform(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
+ `X` is ignored.
- y: int, str, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe-like: Target columns with shape=(n_samples,
- n_targets) for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`. If None, `y` is
+ ignored.
Returns
-------
@@ -238,14 +250,58 @@ def inverse_transform(
Feature set. Only returned if provided.
series or dataframe
- Target column. Only returned if provided.
+ Target column(s). Only returned if provided.
"""
- return variable_return(X, y)
+ check_is_fitted(self)
+
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
+ return variable_return(self._convert(Xt), self._convert(yt))
+
+ def set_output(self, *, transform: EngineDataOptions | None = None) -> Self:
+ """Set output container.
+
+ See sklearn's [user guide][set_output] on how to use the
+ `set_output` API. See [here][data-acceleration] a description
+ of the choices.
+
+ Parameters
+ ----------
+ transform: str or None, default=None
+ Configure the output of the `transform`, `fit_transform`,
+ and `inverse_transform` method. If None, the configuration
+ is not changed. Choose from:
+
+ - "numpy"
+ - "pandas" (default)
+ - "pandas-pyarrow"
+ - "polars"
+ - "polars-lazy"
+ - "pyarrow"
+ - "modin"
+ - "dask"
+ - "pyspark"
+ - "pyspark-pandas"
+
+ Returns
+ -------
+ Self
+ Estimator instance.
+
+ """
+ if not hasattr(self, "_engine"):
+ self.engine = EngineTuple()
+
+ if transform is not None:
+ self.engine = EngineTuple(estimator=self.engine.estimator, data=transform)
+
+ return self
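
A sketch of the resulting API (assumes polars is installed; set_output
changes only the data engine and leaves the estimator engine untouched):

    import pandas as pd

    from atom.data_cleaning import Normalizer

    X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

    normalizer = Normalizer().set_output(transform="polars")
    Xt = normalizer.fit_transform(X)  # returned as a polars.DataFrame
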
@beartype
-class Balancer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Balancer(TransformerMixin, OneToOneFeatureMixin):
"""Balance the number of samples per class in the target column.
When oversampling, the newly created samples have an increasing
@@ -364,8 +420,23 @@ def __init__(
self.strategy = strategy
self.kwargs = kwargs
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
+ def _log_changes(self, y: pd.Series):
+ """Print the changes per target class.
+
+ Parameters
+ ----------
+ y: pd.Series
+ Target column.
+
+ """
+ for key, value in self.mapping_.items():
+ diff = self._counts[key] - np.sum(y == value)
+ if diff > 0:
+ self._log(f" --> Removing {diff} samples from class {key}.", 2)
+ elif diff < 0:
+ self._log(f" --> Adding {-diff} samples to class {key}.", 2)
+
+ def fit(self, X: XConstructor, y: YConstructor) -> Self:
"""Fit to data.
Parameters
@@ -373,31 +444,28 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict or sequence, default=-1
+ y: sequence
Target column corresponding to `X`.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
-
Returns
-------
Self
Estimator instance.
"""
- if isinstance(y, series_t):
- self.target_names_in_ = np.array([y.name])
+ Xt = to_df(X)
+ yt = to_tabular(y, index=Xt.index)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
+ if isinstance(yt, pd.Series):
+ self.target_names_in_ = np.array([yt.name])
else:
raise ValueError("The Balancer class does not support multioutput tasks.")
+ # ClusterCentroids is unavailable since it has no sample_indices_
strategies = {
- # clustercentroids=ClusterCentroids, # noqa: ERA001 (has no sample_indices_)
"condensednearestneighbour": CondensedNearestNeighbour,
"editednearestneighborus": EditedNearestNeighbours,
"repeatededitednearestneighbours": RepeatedEditedNearestNeighbours,
@@ -440,21 +508,20 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
# Create dict of class counts in y
if not hasattr(self, "mapping_"):
- self.mapping_ = {str(v): v for v in y.sort_values().unique()}
+ self.mapping_ = {str(v): v for v in yt.sort_values().unique()}
self._counts = {}
for key, value in self.mapping_.items():
- self._counts[key] = np.sum(y == value)
+ self._counts[key] = np.sum(yt == value)
- self._estimator = estimator.fit(X, y)
+ self._estimator = estimator.fit(Xt, yt)
# Add the estimator as attribute to the instance
setattr(self, f"{estimator.__class__.__name__.lower()}_", self._estimator)
return self
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]:
+ def transform(self, X: XConstructor, y: YConstructor) -> tuple[XReturn, YReturn]:
"""Balance the data.
Parameters
@@ -462,13 +529,9 @@ def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str or sequence, default=-1
+ y: sequence
Target column corresponding to `X`.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - Else: Array with shape=(n_samples,) to use as target.
-
Returns
-------
dataframe
@@ -478,79 +541,74 @@ def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]:
Transformed target column.
"""
+ check_is_fitted(self)
- def log_changes(y):
- """Print the changes per target class."""
- for key, value in self.mapping_.items():
- diff = self._counts[key] - np.sum(y == value)
- if diff > 0:
- self._log(f" --> Removing {diff} samples from class {key}.", 2)
- elif diff < 0:
- self._log(f" --> Adding {-diff} samples to class {key}.", 2)
+ Xt = to_df(X, columns=self.feature_names_in_)
+ yt = to_series(y, index=Xt.index, name=self.target_names_in_[0]) # type: ignore[arg-type]
if "over_sampling" in self._estimator.__module__:
self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1)
- index = X.index # Save indices for later reassignment
- X, y = self._estimator.fit_resample(X, y)
+ index = Xt.index # Save indices for later reassignment
+ Xt, yt = self._estimator.fit_resample(Xt, yt)
# Create indices for the new samples
n_idx: list[int | str]
if index.dtype.kind in "ifu":
- n_idx = list(range(max(index) + 1, max(index) + len(X) - len(index) + 1))
+ n_idx = list(range(max(index) + 1, max(index) + len(Xt) - len(index) + 1))
else:
n_idx = [
f"{self._estimator.__class__.__name__.lower()}_{i}"
- for i in range(1, len(X) - len(index) + 1)
+ for i in range(1, len(Xt) - len(index) + 1)
]
# Assign the old + new indices
- X.index = list(index) + list(n_idx)
- y.index = list(index) + list(n_idx)
+ Xt.index = pd.Index(list(index) + n_idx)
+ yt.index = pd.Index(list(index) + n_idx)
- log_changes(y)
+ self._log_changes(yt)
elif "under_sampling" in self._estimator.__module__:
self._log(f"Undersampling with {self._estimator.__class__.__name__}...", 1)
- self._estimator.fit_resample(X, y)
+ self._estimator.fit_resample(Xt, yt)
# Select chosen rows (imblearn doesn't return them in order)
- samples = sorted(self._estimator.sample_indices_)
- X, y = X.iloc[samples], y.iloc[samples] # type: ignore[call-overload]
+ samples = np.asarray(sorted(self._estimator.sample_indices_))
+ Xt, yt = Xt.iloc[samples], yt.iloc[samples]
- log_changes(y)
+ self._log_changes(yt)
elif "combine" in self._estimator.__module__:
self._log(f"Balancing with {self._estimator.__class__.__name__}...", 1)
- index = X.index
- X_new, y_new = self._estimator.fit_resample(X, y)
+ index = Xt.index
+ X_new, y_new = self._estimator.fit_resample(Xt, yt)
# Select rows kept by the undersampler
if self._estimator.__class__.__name__ == "SMOTEENN":
- samples = sorted(self._estimator.enn_.sample_indices_)
+ samples = np.asarray(sorted(self._estimator.enn_.sample_indices_))
elif self._estimator.__class__.__name__ == "SMOTETomek":
- samples = sorted(self._estimator.tomek_.sample_indices_)
+ samples = np.asarray(sorted(self._estimator.tomek_.sample_indices_))
# Select the remaining samples from the old dataframe
- o_samples = [s for s in samples if s < len(X)]
- X, y = X.iloc[o_samples], y.iloc[o_samples] # type: ignore[call-overload]
+ o_samples = [s for s in samples if s < len(Xt)]
+ Xt, yt = Xt.iloc[o_samples], yt.iloc[o_samples] # type: ignore[call-overload]
# Create indices for the new samples
if index.dtype.kind in "ifu":
- n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(X) + 1))
+ n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(Xt) + 1))
else:
n_idx = [
f"{self._estimator.__class__.__name__.lower()}_{i}"
- for i in range(1, len(X_new) - len(X) + 1)
+ for i in range(1, len(X_new) - len(Xt) + 1)
]
# Select the new samples and assign the new indices
X_new = X_new.iloc[-len(X_new) + len(o_samples):]
- X_new.index = n_idx
+ X_new.index = pd.Index(n_idx)
y_new = y_new.iloc[-len(y_new) + len(o_samples):]
- y_new.index = n_idx
+ y_new.index = pd.Index(n_idx)
# First, output the samples created
for key, value in self.mapping_.items():
@@ -559,17 +617,17 @@ def log_changes(y):
# Then, output the samples dropped
for key, value in self.mapping_.items():
- if (diff := self._counts[key] - np.sum(y == value)) > 0:
+ if (diff := self._counts[key] - np.sum(yt == value)) > 0:
self._log(f" --> Removing {diff} samples from class: {key}.", 2)
# Add the new samples to the old dataframe
- X, y = bk.concat([X, X_new]), bk.concat([y, y_new])
+ Xt, yt = pd.concat([Xt, X_new]), pd.concat([yt, y_new])
- return X, y
+ return self._convert(Xt), self._convert(yt)
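
A minimal usage sketch for the reworked Balancer (data and strategy are
illustrative; strategy names are the lowercased imblearn class names):

    import pandas as pd

    from atom.data_cleaning import Balancer

    X = pd.DataFrame({"f1": range(100), "f2": range(100)})
    y = pd.Series([0] * 90 + [1] * 10, name="target")

    balancer = Balancer(strategy="randomoversampler")
    X_bal, y_bal = balancer.fit_transform(X, y)  # 90/90 after oversampling
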
@beartype
-class Cleaner(TransformerMixin, _SetOutputMixin):
+class Cleaner(TransformerMixin):
"""Applies standard data cleaning steps on a dataset.
Use the parameters to choose which transformations to perform.
@@ -623,24 +681,12 @@ class Cleaner(TransformerMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
-
- - "estimator":
-
- - "sklearn" (default)
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -733,27 +779,17 @@ def __init__(
self.drop_missing_target = drop_missing_target
self.encode_target = encode_target
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
+ `X` is ignored.
- y: int, str, dict, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -761,7 +797,14 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self:
Estimator instance.
"""
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
self.mapping_: dict[str, Any] = {}
+ self.target_names_in_ = np.array([])
self._drop_cols = []
self._estimators = {}
@@ -770,26 +813,23 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self:
self._log("Fitting Cleaner...", 1)
- if X is not None and self.drop_dtypes is not None:
- self._drop_cols = list(X.select_dtypes(include=lst(self.drop_dtypes)).columns)
+ if Xt is not None and self.drop_dtypes is not None:
+ self._drop_cols = list(Xt.select_dtypes(include=lst(self.drop_dtypes)).columns)
- if y is not None:
- if isinstance(y, series_t):
- self.target_names_in_ = np.array([y.name])
- else:
- self.target_names_in_ = y.columns.to_numpy()
+ if yt is not None:
+ self.target_names_in_ = np.array(get_col_names(yt))
if self.drop_chars:
- if isinstance(y, series_t):
- y.name = re.sub(self.drop_chars, "", str(y.name))
+ if isinstance(yt, pd.DataFrame):
+ yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1)
else:
- y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1)
+ yt.name = re.sub(self.drop_chars, "", str(yt.name))
if self.drop_missing_target:
- y = replace_missing(y, self.missing_).dropna(axis=0)
+ yt = replace_missing(yt, self.missing_).dropna(axis=0)
if self.encode_target:
- for col in get_cols(y):
+ for col in get_cols(yt):
if isinstance(col.iloc[0], sequence_t): # Multilabel
MultiLabelBinarizer = self._get_est_class(
name="MultiLabelBinarizer",
@@ -799,7 +839,9 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self:
elif list(uq := np.unique(col)) != list(range(col.nunique())):
LabelEncoder = self._get_est_class("LabelEncoder", "preprocessing")
self._estimators[col.name] = LabelEncoder().fit(col)
- self.mapping_.update({col.name: {str(it(v)): i for i, v in enumerate(uq)}})
+ self.mapping_.update(
+ {str(col.name): {str(it(v)): i for i, v in enumerate(uq)}}
+ )
return self
@@ -829,31 +871,21 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) ->
return np.array(columns)
- @composed(crash, method_to_log)
def transform(
self,
- X: DataFrame | None = None,
- y: Pandas | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ X: XConstructor | None = None,
+ y: YConstructor | None = None,
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Apply the data cleaning steps to the data.
Parameters
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
+ `X` is ignored.
- y: int, str, dict, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -864,95 +896,98 @@ def transform(
Transformed target column. Only returned if provided.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+ yt = to_tabular(y, index=getattr(Xt, "index", None), columns=self.target_names_in_)
+
self._log("Cleaning the data...", 1)
- if X is not None:
+ if Xt is not None:
# Unify all missing values
- X = replace_missing(X, self.missing_)
+ Xt = replace_missing(Xt, self.missing_)
- for name, column in X.items():
+ for name, column in Xt.items():
# Drop features with an invalid data type
if name in self._drop_cols:
self._log(
f" --> Dropping feature {name} for "
f"having type: {column.dtype.name}.", 2,
)
- X = X.drop(columns=name)
+ Xt = Xt.drop(columns=name)
elif column.dtype.name in CAT_TYPES:
if self.strip_categorical:
# Strip strings from blank spaces
- X[name] = column.apply(
+ Xt[name] = column.apply(
lambda val: val.strip() if isinstance(val, str) else val
)
# Drop prohibited chars from column names
if self.drop_chars:
- X = X.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x)))
+ Xt = Xt.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x)))
# Drop duplicate samples
if self.drop_duplicates:
- X = X.drop_duplicates(ignore_index=True)
+ Xt = Xt.drop_duplicates(ignore_index=True)
if self.convert_dtypes:
- X = X.convert_dtypes()
+ Xt = Xt.convert_dtypes()
- if y is not None:
+ if yt is not None:
if self.drop_chars:
- if isinstance(y, series_t):
- y.name = re.sub(self.drop_chars, "", str(y.name))
+ if isinstance(yt, pd.Series):
+ yt.name = re.sub(self.drop_chars, "", str(yt.name))
else:
- y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1)
+ yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1)
# Delete samples with missing values in target
if self.drop_missing_target:
- length = len(y) # Save original length to count deleted rows later
- y = replace_missing(y, self.missing_).dropna()
+ length = len(yt) # Save original length to count deleted rows later
+ yt = replace_missing(yt, self.missing_).dropna()
- if X is not None:
- X = X[X.index.isin(y.index)] # Select only indices that remain
+ if Xt is not None:
+ Xt = Xt[Xt.index.isin(yt.index)] # Select only indices that remain
- if (d := length - len(y)) > 0:
+ if (d := length - len(yt)) > 0:
self._log(f" --> Dropping {d} rows with missing values in target.", 2)
if self.encode_target and self._estimators:
- yt = y.__class__(dtype="object")
- for col in get_cols(y):
+ y_new = yt.__class__(dtype="object")
+ for col in get_cols(yt):
if est := self._estimators.get(col.name):
if n_cols(out := est.transform(col)) == 1:
self._log(f" --> Label-encoding column {col.name}.", 2)
- out = to_series(out, y.index, col.name)
-
+ out = to_series(out, yt.index, str(col.name))
else:
self._log(f" --> Label-binarizing column {col.name}.", 2)
out = to_df(
data=out,
- index=y.index,
+ index=yt.index,
columns=[f"{col.name}_{c}" for c in est.classes_],
)
# Replace target with encoded column(s)
- if isinstance(y, series_t):
- yt = out
+ if isinstance(yt, pd.Series):
+ y_new = out
else:
- yt = merge(yt, out)
+ y_new = merge(y_new, out)
else: # Add unchanged column
- yt = merge(yt, col)
+ y_new = merge(y_new, col)
- y = yt
+ yt = y_new
if self.convert_dtypes:
- y = y.convert_dtypes()
+ yt = yt.convert_dtypes()
- return variable_return(X, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
- @composed(crash, method_to_log)
def inverse_transform(
self,
- X: DataFrame | None = None,
- y: Pandas | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ X: XConstructor | None = None,
+ y: YConstructor | None = None,
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Inversely transform the label encoding.
This method only inversely transforms the target encoding.
@@ -964,17 +999,8 @@ def inverse_transform(
X: dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
- y: int, str, dict, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -985,38 +1011,43 @@ def inverse_transform(
Original target column. Only returned if provided.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
self._log("Inversely cleaning the data...", 1)
- if y is not None and self._estimators:
- yt = y.__class__(dtype="object")
+ if yt is not None and self._estimators:
+ y_new = yt.__class__(dtype="object")
for col in self.target_names_in_:
if est := self._estimators.get(col):
if est.__class__.__name__ == "LabelEncoder":
self._log(f" --> Inversely label-encoding column {col}.", 2)
- out = est.inverse_transform(bk.DataFrame(y)[col])
+ out = est.inverse_transform(pd.DataFrame(yt)[col])
- elif isinstance(y, dataframe_t):
+ elif isinstance(yt, pd.DataFrame):
self._log(f" --> Inversely label-binarizing column {col}.", 2)
out = est.inverse_transform(
- y.loc[:, y.columns.str.startswith(f"{col}_")].to_numpy()
+ yt.loc[:, yt.columns.str.startswith(f"{col}_")].to_numpy()
)
# Replace encoded columns with target column
- if isinstance(y, series_t):
- yt = to_series(out, y.index, col)
+ if isinstance(yt, pd.Series):
+ y_new = to_series(out, yt.index, col)
else:
- yt = merge(yt, to_series(out, y.index, col))
+ y_new = merge(y_new, to_series(out, yt.index, col))
else: # Add unchanged column
- yt = merge(yt, bk.DataFrame(y)[col])
+ y_new = merge(y_new, pd.DataFrame(yt)[col])
- y = yt
+ yt = y_new
- return variable_return(X, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
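
A round-trip sketch of the target encoding (labels are illustrative):

    import pandas as pd

    from atom.data_cleaning import Cleaner

    y = pd.Series(["low", "high", "low", "mid"], name="target")

    cleaner = Cleaner(encode_target=True)
    y_enc = cleaner.fit_transform(y=y)           # classes mapped to 0..n-1
    y_orig = cleaner.inverse_transform(y=y_enc)  # original labels restored
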
@beartype
-class Decomposer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Decomposer(TransformerMixin, OneToOneFeatureMixin):
"""Detrend and deseasonalize the time series.
This class does two things:
@@ -1155,8 +1186,7 @@ def __init__(
self.sp = sp
self.seasonal_model = seasonal_model
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -1164,7 +1194,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1175,13 +1205,17 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
"""
from atom.models import MODELS
+ Xt = to_df(X)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
if isinstance(self.model, str):
if self.model in MODELS:
model = MODELS[self.model](
goal=Goal.forecast,
**{x: getattr(self, x) for x in BaseTransformer.attrs if hasattr(self, x)},
)
- model.task = Goal.forecast.infer_task(y)
forecaster = model._get_est({})
else:
raise ValueError(
@@ -1203,7 +1237,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
self._log("Fitting Decomposer...", 1)
self._estimators: dict[Hashable, tuple[Transformer, Transformer]] = {}
- for name, column in X.select_dtypes(include="number").items():
+ for name, column in Xt.select_dtypes(include="number").items():
trend = Detrender(
forecaster=forecaster,
model=self.trend_model,
@@ -1224,8 +1258,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
return self
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Decompose the data.
Parameters
@@ -1233,7 +1266,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1242,15 +1275,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed feature set.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Decomposing the data...", 1)
for col, (trend, season) in self._estimators.items():
- X[col] = season.transform(trend.transform(X[col]))
+ Xt[col] = season.transform(trend.transform(Xt[col]))
- return X
+ return self._convert(Xt)
- @composed(crash, method_to_log)
- def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Inversely transform the data.
Parameters
@@ -1258,7 +1294,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1267,16 +1303,20 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Original feature set.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Inversely decomposing the data...", 1)
for col, (trend, season) in self._estimators.items():
- X[col] = trend.inverse_transform(season.inverse_transform(X[col]))
+ Xt[col] = trend.inverse_transform(season.inverse_transform(Xt[col]))
- return X
+ return self._convert(Xt)
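
For reference, a sketch of the intended call pattern (settings and data are
illustrative and depend on the installed sktime version; transform detrends
and then deseasonalizes each numeric column, inverse_transform undoes both):

    import pandas as pd

    from atom.data_cleaning import Decomposer

    X = pd.DataFrame(
        {"sales": [float(i % 12 + i) for i in range(36)]},
        index=pd.period_range("2020-01", periods=36, freq="M"),
    )

    decomposer = Decomposer(sp=12).fit(X)        # sp: seasonal periodicity
    X_dec = decomposer.transform(X)              # trend and season removed
    X_rec = decomposer.inverse_transform(X_dec)  # approximately X again
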
@beartype
-class Discretizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Discretizer(TransformerMixin, OneToOneFeatureMixin):
"""Bin continuous data into intervals.
For each feature, the bin edges are computed during fit and,
@@ -1334,24 +1374,12 @@ class Discretizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
-
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "estimator":
-
- - "sklearn" (default)
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -1444,8 +1472,7 @@ def __init__(
self.bins = bins
self.labels = labels
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -1453,7 +1480,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1501,16 +1528,21 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
return labels
- self._estimators: dict[str, Estimator] = {}
- self._labels: dict[str, Sequence[str]] = {}
+ Xt = to_df(X)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
+ self._estimators: dict[Hashable, Estimator] = {}
+ self._labels: dict[Hashable, Sequence[str]] = {}
self._log("Fitting Discretizer...", 1)
- for i, col in enumerate(X.select_dtypes(include="number")):
+ for i, col in enumerate(Xt.select_dtypes(include="number")):
# Assign bins per column
if isinstance(self.bins, dict):
if col in self.bins:
- bins_c = self.bins[col]
+ bins_c = self.bins[str(col)]
else:
continue # Ignore existing column not specified in dict
else:
@@ -1524,7 +1556,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
raise ValueError(
"Invalid value for the bins parameter. The length of the "
"bins does not match the length of the columns, got len"
- f"(bins)={len(bins_c)} and len(columns)={X.shape[1]}."
+ f"(bins)={len(bins_c)} and len(columns)={Xt.shape[1]}."
) from None
else:
bins_x = bins_c
@@ -1542,11 +1574,11 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
encode="ordinal",
strategy=self.strategy,
**kwargs,
- ).fit(X[[col]])
+ ).fit(Xt[[col]])
# Save labels for transform method
self._labels[col] = get_labels(
- col=col,
+ col=str(col),
bins=self._estimators[col].bin_edges_[0],
)
@@ -1566,14 +1598,13 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
# Make of cut a transformer
self._estimators[col] = FunctionTransformer(
- func=bk.cut,
- kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)},
- ).fit(X[[col]])
+ func=pd.cut,
+ kw_args={"bins": bins_c, "labels": get_labels(str(col), bins_c)},
+ ).fit(Xt[[col]])
return self
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Bin the data into intervals.
Parameters
@@ -1581,7 +1612,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1590,25 +1621,29 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed feature set.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Binning the features...", 1)
for col in self._estimators:
if self.strategy == "custom":
- X[col] = self._estimators[col].transform(X[col])
+ Xt[col] = self._estimators[col].transform(Xt[col])
else:
- X[col] = self._estimators[col].transform(X[[col]]).iloc[:, 0]
+ Xt[col] = self._estimators[col].transform(Xt[[col]]).iloc[:, 0]
# Replace cluster values with labels
for i, label in enumerate(self._labels[col]):
- X[col] = X[col].replace(i, label)
+ Xt[col] = Xt[col].replace(i, label)
- self._log(f" --> Discretizing feature {col} in {X[col].nunique()} bins.", 2)
+ self._log(f" --> Discretizing feature {col} in {Xt[col].nunique()} bins.", 2)
- return X
+ return self._convert(Xt)
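
A sketch of the custom strategy (bin edges are illustrative; labels are
auto-generated from the edges when the labels parameter is left at None):

    import pandas as pd

    from atom.data_cleaning import Discretizer

    X = pd.DataFrame({"age": [12, 25, 47, 81]})

    disc = Discretizer(strategy="custom", bins={"age": [0, 18, 65, 100]})
    Xt = disc.fit_transform(X)  # values binned into labels like "(18, 65]"
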
@beartype
-class Encoder(TransformerMixin, _SetOutputMixin):
+class Encoder(TransformerMixin):
"""Perform encoding of categorical features.
The encoding type depends on the number of classes in the column:
@@ -1761,8 +1796,7 @@ def __init__(
self.value = value
self.kwargs = kwargs
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Note that leaving y=None can lead to errors if the `strategy`
@@ -1774,17 +1808,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict, sequence or dataframe-like
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: sequence or dataframe-like
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -1811,6 +1836,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
"woe": WOEEncoder,
}
+ Xt = to_df(X)
+ yt = to_tabular(y, index=Xt.index)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
if isinstance(self.strategy, str):
if self.strategy.lower().endswith("encoder"):
self.strategy = self.strategy[:-7] # Remove 'Encoder' at the end
@@ -1835,7 +1866,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
if self.infrequent_to_value:
if self.infrequent_to_value < 1:
- infrequent_to_value = int(self.infrequent_to_value * len(X))
+ infrequent_to_value = int(self.infrequent_to_value * len(Xt))
else:
infrequent_to_value = int(self.infrequent_to_value)
@@ -1843,12 +1874,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
encoders: dict[str, list[str]] = defaultdict(list)
- for name, column in X.select_dtypes(include=CAT_TYPES).items():
+ for name, column in Xt.select_dtypes(include=CAT_TYPES).items():
# Replace infrequent classes with the string in `value`
if self.infrequent_to_value:
values = column.value_counts()
self._to_value[name] = values[values <= infrequent_to_value].index.tolist()
- X[name] = column.replace(self._to_value[name], self.value)
+ Xt[name] = column.replace(self._to_value[name], self.value)
# Get the unique categories before fitting
self._categories[name] = column.dropna().sort_values().unique().tolist()
@@ -1862,8 +1893,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
self._log(
f" --> The number of classes passed to feature {name} in the "
f"ordinal parameter ({len(ordinal_c)}) don't match the number "
- f"of classes in the data ({column.nunique(dropna=True)}).",
- 1,
+ f"of classes in the data ({column.nunique(dropna=True)}).", 1,
severity="warning",
)
@@ -1908,7 +1938,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
remainder="passthrough",
n_jobs=self.n_jobs,
verbose_feature_names_out=False,
- ).fit(X, y)
+ ).fit(Xt, yt)
return self
@@ -1935,8 +1965,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) ->
return get_col_order(cols, self.feature_names_in_, self._estimator.feature_names_in_)
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Encode the data.
Parameters
@@ -1944,7 +1973,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1953,10 +1982,14 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Encoded dataframe.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Encoding categorical columns...", 1)
# Convert infrequent classes to value
- X = X.replace(self._to_value, self.value)
+ Xt = Xt.replace(self._to_value, self.value)
for name, categories in self._categories.items():
if name in self._estimator.transformers_[0][2]:
@@ -1968,24 +2001,24 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(
f" --> {estimator.__class__.__name__[:-7]}-encoding feature "
- f"{name}. Contains {X[name].nunique()} classes.", 2,
+ f"{name}. Contains {Xt[name].nunique()} classes.", 2,
)
# Count the propagated missing values
- if n_nans := X[name].isna().sum():
+ if n_nans := Xt[name].isna().sum():
self._log(f" --> Propagating {n_nans} missing values.", 2)
# Check for unknown classes
- if uc := len(X[name].dropna()[~X[name].isin(categories)]):
+ if uc := len(Xt[name].dropna()[~Xt[name].isin(categories)]):
self._log(f" --> Handling {uc} unknown classes.", 2)
- Xt = self._estimator.transform(X)
+ Xt = self._estimator.transform(Xt)
- return Xt[self.get_feature_names_out()]
+ return self._convert(Xt[self.get_feature_names_out()])
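
A usage sketch (settings are illustrative; columns with more classes than
max_onehot are assumed to fall through to the strategy encoder):

    import pandas as pd

    from atom.data_cleaning import Encoder

    X = pd.DataFrame({"city": ["a", "b", "a", "c", "b", "c"]})
    y = pd.Series([0, 1, 0, 1, 1, 0], name="target")

    encoder = Encoder(strategy="target", max_onehot=2)
    Xt = encoder.fit_transform(X, y)  # "city" becomes one numeric column
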
@beartype
-class Imputer(TransformerMixin, _SetOutputMixin):
+class Imputer(TransformerMixin):
"""Handle missing values in the data.
Impute or remove missing values according to the selected strategy.
@@ -1999,7 +2032,7 @@ class Imputer(TransformerMixin, _SetOutputMixin):
Parameters
----------
- strat_num: str, int or float, default="drop"
+ strat_num: str, int or float, default="mean"
Imputing strategy for numerical columns. Choose from:
- "drop": Drop rows containing missing values.
@@ -2019,7 +2052,7 @@ class Imputer(TransformerMixin, _SetOutputMixin):
of column.
- int or float: Impute with provided numerical value.
- strat_cat: str, default="drop"
+ strat_cat: str, default="most_frequent"
Imputing strategy for categorical columns. Choose from:
- "drop": Drop rows containing missing values.
@@ -2047,24 +2080,12 @@ class Imputer(TransformerMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
-
- - "estimator":
-
- - "sklearn" (default)
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -2145,8 +2166,8 @@ class Imputer(TransformerMixin, _SetOutputMixin):
def __init__(
self,
- strat_num: Scalar | NumericalStrats = "drop",
- strat_cat: str | CategoricalStrats = "drop",
+ strat_num: Scalar | NumericalStrats = "mean",
+ strat_cat: str | CategoricalStrats = "most_frequent",
*,
max_nan_rows: FloatLargerZero | None = None,
max_nan_cols: FloatLargerZero | None = None,
@@ -2168,8 +2189,7 @@ def __init__(
self.max_nan_rows = max_nan_rows
self.max_nan_cols = max_nan_cols
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -2177,7 +2197,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -2186,22 +2206,27 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
Estimator instance.
"""
+ Xt = to_df(X)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
if not hasattr(self, "missing_"):
self.missing_ = DEFAULT_MISSING
self._log("Fitting Imputer...", 1)
# Unify all values to impute
- X = replace_missing(X, self.missing_)
+ Xt = replace_missing(Xt, self.missing_)
if self.max_nan_rows is not None:
if self.max_nan_rows <= 1:
- self._max_nan_rows = int(X.shape[1] * self.max_nan_rows)
+ self._max_nan_rows = int(Xt.shape[1] * self.max_nan_rows)
else:
self._max_nan_rows = int(self.max_nan_rows)
- X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows)
- if X.empty:
+ Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows)
+ if Xt.empty:
raise ValueError(
"Invalid value for the max_nan_rows parameter, got "
f"{self.max_nan_rows}. All rows contain more than "
@@ -2211,11 +2236,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
if self.max_nan_cols is not None:
if self.max_nan_cols <= 1:
- max_nan_cols = int(X.shape[0] * self.max_nan_cols)
+ max_nan_cols = int(Xt.shape[0] * self.max_nan_cols)
else:
max_nan_cols = int(self.max_nan_cols)
- X = X.drop(columns=X.columns[X.isna().sum() > max_nan_cols])
+ Xt = Xt.drop(columns=Xt.columns[Xt.isna().sum() > max_nan_cols])
# Load the imputer class from sklearn or cuml (note the different modules)
SimpleImputer = self._get_est_class(
@@ -2235,7 +2260,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
elif self.strat_num == "drop":
num_imputer = "passthrough"
else:
- num_imputer = make_sklearn(sktimeImputer)(
+ num_imputer = make_sklearn(SktimeImputer)(
method=self.strat_num,
missing_values=[pd.NA],
random_state=self.random_state,
@@ -2263,13 +2288,13 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
self._estimator = ColumnTransformer(
transformers=[
- ("num_imputer", num_imputer, list(X.select_dtypes(include="number"))),
- ("cat_imputer", cat_imputer, list(X.select_dtypes(include=CAT_TYPES))),
+ ("num_imputer", num_imputer, list(Xt.select_dtypes(include="number"))),
+ ("cat_imputer", cat_imputer, list(Xt.select_dtypes(include=CAT_TYPES))),
],
remainder="passthrough",
n_jobs=self.n_jobs,
verbose_feature_names_out=False,
- ).fit(X)
+ ).fit(Xt)
return self
@@ -2295,12 +2320,11 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) ->
[c for c in self.feature_names_in_ if c in self._estimator.get_feature_names_out()]
)
- @composed(crash, method_to_log)
def transform(
self,
- X: DataFrame,
- y: Pandas | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ X: XConstructor,
+ y: YConstructor | None = None,
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Impute the missing values.
Note that leaving y=None can lead to inconsistencies in
@@ -2312,17 +2336,8 @@ def transform(
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -2333,6 +2348,11 @@ def transform(
Transformed target column. Only returned if provided.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+ yt = to_tabular(y, index=Xt.index)
+
num_imputer = self._estimator.named_transformers_["num_imputer"]
cat_imputer = self._estimator.named_transformers_["cat_imputer"]
@@ -2341,52 +2361,49 @@ def transform(
self._log("Imputing missing values...", 1)
# Unify all values to impute
- X = replace_missing(X, self.missing_)
+ Xt = replace_missing(Xt, self.missing_)
# Drop rows with too many missing values
if self.max_nan_rows is not None:
- length = len(X)
- X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows)
- if diff := length - len(X):
+ length = len(Xt)
+ Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows)
+ if diff := length - len(Xt):
self._log(
f" --> Dropping {diff} samples for containing more "
- f"than {self._max_nan_rows} missing values.",
- 2,
+ f"than {self._max_nan_rows} missing values.", 2,
)
if self.strat_num == "drop":
- length = len(X)
- X = X.dropna(subset=self._estimator.transformers_[0][2])
- if diff := length - len(X):
+ length = len(Xt)
+ Xt = Xt.dropna(subset=self._estimator.transformers_[0][2])
+ if diff := length - len(Xt):
self._log(
f" --> Dropping {diff} samples for containing "
- f"missing values in numerical columns.",
- 2,
+ f"missing values in numerical columns.", 2,
)
if self.strat_cat == "drop":
- length = len(X)
- X = X.dropna(subset=self._estimator.transformers_[1][2])
- if diff := length - len(X):
+ length = len(Xt)
+ Xt = Xt.dropna(subset=self._estimator.transformers_[1][2])
+ if diff := length - len(Xt):
self._log(
f" --> Dropping {diff} samples for containing "
- f"missing values in categorical columns.",
- 2,
+ f"missing values in categorical columns.", 2,
)
# Print imputation information per feature
- for name, column in X.items():
+ for name, column in Xt.items():
if nans := column.isna().sum():
# Drop columns with too many missing values
if name not in self._estimator.feature_names_in_:
self._log(
f" --> Dropping feature {name}. Contains {nans} "
- f"({nans * 100 // len(X)}%) missing values.", 2,
+ f"({nans * 100 // len(Xt)}%) missing values.", 2,
)
- X = X.drop(columns=name)
+ Xt = Xt.drop(columns=name)
continue
- if self.strat_num != "drop" and name in num_imputer.feature_names_in_:
+ if name in getattr(num_imputer, "feature_names_in_", []):
if not isinstance(self.strat_num, str):
self._log(
f" --> Imputing {nans} missing values with "
@@ -2400,15 +2417,14 @@ def transform(
elif self.strat_num in ("mean", "median", "most_frequent"):
self._log(
f" --> Imputing {nans} missing values with {self.strat_num} "
- f"({np.round(get_stat(num_imputer, name), 2)}) in column "
- f"{name}.", 2,
+ f"({np.round(get_stat(num_imputer, name), 2)}) in column {name}.", 2,
)
else:
self._log(
f" --> Imputing {nans} missing values with {self.strat_num} "
f"in column {name}.", 2,
)
- elif self.strat_cat != "drop" and name in cat_imputer.feature_names_in_:
+ elif name in getattr(cat_imputer, "feature_names_in_", []):
if self.strat_cat == "most_frequent":
self._log(
f" --> Imputing {nans} missing values with most_frequent "
@@ -2420,20 +2436,20 @@ def transform(
f"'{self.strat_cat}' in column {name}.", 2,
)
- Xt = self._estimator.transform(X)
+ Xt = self._estimator.transform(Xt)
# Make y consistent with X
- if y is not None:
- y = y[y.index.isin(Xt.index)]
+ if yt is not None:
+ yt = yt[yt.index.isin(Xt.index)]
# Reorder columns to original order
Xt = Xt[self.get_feature_names_out()]
- return variable_return(Xt, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
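
A sketch of the new defaults in action (data is illustrative; the old
defaults dropped rows with missing values instead of imputing them):

    import numpy as np
    import pandas as pd

    from atom.data_cleaning import Imputer

    X = pd.DataFrame({"num": [1.0, np.nan, 3.0], "cat": ["a", None, "b"]})

    # nan -> 2.0 (mean); None -> most frequent category (ties resolve to
    # the smallest value, here "a")
    Xt = Imputer().fit_transform(X)
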
@beartype
-class Normalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Normalizer(TransformerMixin, OneToOneFeatureMixin):
"""Transform the data to follow a Normal/Gaussian distribution.
This transformation is useful for modeling issues related to
@@ -2470,24 +2486,12 @@ class Normalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
-
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "estimator":
-
- - "sklearn" (default)
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -2575,8 +2579,7 @@ def __init__(
self.strategy = strategy
self.kwargs = kwargs
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -2584,7 +2587,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -2599,6 +2602,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
"quantile": "QuantileTransformer",
}
+ Xt = to_df(X)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
if self.strategy in ("yeojohnson", "boxcox"):
estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
self._estimator = estimator(
@@ -2619,7 +2627,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
f"Choose from: {', '.join(strategies)}."
)
- num_cols = X.select_dtypes(include="number")
+ num_cols = Xt.select_dtypes(include="number")
if num_cols.empty:
raise ValueError(
@@ -2635,8 +2643,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
return self
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Apply the transformations to the data.
Parameters
@@ -2644,7 +2651,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -2653,15 +2660,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Normalized dataframe.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Normalizing features...", 1)
- Xt = self._estimator.transform(X[self._estimator.feature_names_in_])
- X.update(Xt)
+ Xt.update(self._estimator.transform(Xt[self._estimator.feature_names_in_]))
- return X[self.feature_names_in_]
+ return self._convert(Xt)
- @composed(crash, method_to_log)
- def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Apply the inverse transformation to the data.
Parameters
@@ -2669,7 +2678,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -2678,17 +2687,21 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Original dataframe.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Inversely normalizing features...", 1)
- Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_])
- Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_)
- X.update(Xt)
+ out: np.ndarray = self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_])
+
+ Xt.update(to_df(out, index=Xt.index, columns=self._estimator.feature_names_in_))
- return X
+ return self._convert(Xt)
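
A minimal usage sketch of the refactored Normalizer, assuming the patched ATOM build; the strategy names come from the strategies mapping in fit, and fit_transform is supplied by TransformerMixin:

    import numpy as np
    import pandas as pd
    from atom.data_cleaning import Normalizer

    X = pd.DataFrame({"x": np.random.lognormal(size=100)})  # heavily skewed

    normalizer = Normalizer(strategy="yeojohnson")
    Xt = normalizer.fit_transform(X)           # approximately Gaussian output
    X_back = normalizer.inverse_transform(Xt)  # back to the original scale
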
@beartype
-class Pruner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Pruner(TransformerMixin, OneToOneFeatureMixin):
"""Prune outliers from the data.
Replace or remove outliers. The definition of outlier depends
@@ -2743,25 +2756,12 @@ class Pruner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
-
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "estimator":
-
- - "sklearn" (default)
- - "sklearnex"
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -2850,12 +2850,11 @@ def __init__(
self.include_target = include_target
self.kwargs = kwargs
- @composed(crash, method_to_log)
def transform(
self,
- X: DataFrame,
- y: Pandas | None = None,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ X: XConstructor,
+ y: YConstructor | None = None,
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Apply the outlier strategy on the data.
Parameters
@@ -2863,17 +2862,8 @@ def transform(
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, dict, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -2884,6 +2874,9 @@ def transform(
Transformed target column. Only returned if provided.
"""
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+ yt = to_tabular(y, index=Xt.index)
+
# Estimators with their modules
strategies = {
"iforest": ["IsolationForest", "ensemble"],
@@ -2922,7 +2915,7 @@ def transform(
self._log("Pruning outliers...", 1)
# Prepare dataset (merge with y and exclude categorical columns)
- objective = merge(X, y) if self.include_target and y is not None else X
+ objective = merge(Xt, yt) if self.include_target and yt is not None else Xt
objective = objective.select_dtypes(include=["number"])
outliers = []
@@ -2984,27 +2977,27 @@ def transform(
if outliers:
# Select outliers from intersection of strategies
- mask = [any(strats) for strats in zip(*outliers, strict=True)]
- self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2)
+ keep_rows = [any(strats) for strats in zip(*outliers, strict=True)]
+ self._log(f" --> Dropping {len(keep_rows) - sum(keep_rows)} outliers.", 2)
# Keep only the non-outliers from the data
- X = X[mask]
- if y is not None:
- y = y[mask]
+ Xt = Xt[keep_rows]
+ if yt is not None:
+ yt = yt[keep_rows]
else:
# Replace the columns in X and y with the new values from objective
- X.update(objective)
- if isinstance(y, series_t) and y.name in objective:
- y.update(objective[str(y.name)])
- elif isinstance(y, dataframe_t):
- y.update(objective)
+ Xt.update(objective)
+ if isinstance(yt, pd.Series) and yt.name in objective:
+ yt.update(objective[str(yt.name)])
+ elif isinstance(yt, pd.DataFrame):
+ yt.update(objective)
- return variable_return(X, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
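
Note the keep-mask semantics above: each entry of outliers holds True for rows a strategy considers inliers, so a row survives when any strategy keeps it and is dropped only when all strategies agree it is an outlier. A standalone sketch of that combination:

    # One boolean keep-mask per strategy (True = inlier).
    iforest = [True, False, False, True]
    lof = [True, True, False, True]

    keep = [any(strats) for strats in zip(iforest, lof, strict=True)]
    print(keep)                   # [True, True, False, True]
    print(len(keep) - sum(keep))  # 1 row dropped
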
@beartype
-class Scaler(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Scaler(TransformerMixin, OneToOneFeatureMixin):
"""Scale the data.
Apply one of sklearn's scaling strategies. Categorical columns
@@ -3033,24 +3026,12 @@ class Scaler(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
-
- - "estimator":
-
- - "sklearn" (default)
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -3128,8 +3109,7 @@ def __init__(
self.include_binary = include_binary
self.kwargs = kwargs
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -3137,7 +3117,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -3153,10 +3133,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
"robust": "RobustScaler",
}
- num_cols = X.select_dtypes(include="number")
+ Xt = to_df(X)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
+ num_cols = Xt.select_dtypes(include="number")
if not self.include_binary:
- num_cols = X[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]]
+ num_cols = Xt[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]]
if num_cols.empty:
raise ValueError(
@@ -3165,19 +3150,17 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
"non-binary columns when include_binary=False."
)
- estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
- self._estimator = estimator(**self.kwargs)
-
self._log("Fitting Scaler...", 1)
- self._estimator.fit(num_cols)
+
+ estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
+ self._estimator = estimator(**self.kwargs).fit(num_cols)
# Add the estimator as attribute to the instance
setattr(self, f"{self.strategy}_", self._estimator)
return self
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Perform standardization by centering and scaling.
Parameters
@@ -3185,7 +3168,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -3194,15 +3177,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Scaled dataframe.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Scaling features...", 1)
- Xt = self._estimator.transform(X[self._estimator.feature_names_in_])
- X.update(Xt)
+ Xt.update(self._estimator.transform(Xt[self._estimator.feature_names_in_]))
- return X
+ return self._convert(Xt)
- @composed(crash, method_to_log)
- def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Apply the inverse transformation to the data.
Parameters
@@ -3210,7 +3195,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -3219,10 +3204,14 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Original dataframe.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Inversely scaling features...", 1)
- Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_])
- Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_)
- X.update(Xt)
+ out: np.ndarray = self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_])
+
+ Xt.update(to_df(out, index=Xt.index, columns=self._estimator.feature_names_in_))
- return X
+ return self._convert(Xt)
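
The include_binary filter in Scaler.fit treats a column as binary when all of its unique values are 0 or 1. A small standalone check of that expression:

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"flag": [0, 1, 1, 0], "age": [23, 45, 31, 52]})

    num_cols = X.select_dtypes(include="number")
    non_binary = [n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]
    print(non_binary)  # ['age'] -- 'flag' is skipped when include_binary=False
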
diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py
index 16e745f84..430d7fe71 100644
--- a/atom/feature_engineering.py
+++ b/atom/feature_engineering.py
@@ -9,10 +9,9 @@
from collections.abc import Hashable
from random import sample
-from typing import Any, Literal
+from typing import Any, Literal, cast
import featuretools as ft
-import joblib
import numpy as np
import pandas as pd
from beartype import beartype
@@ -34,14 +33,14 @@
from atom.basetransformer import BaseTransformer
from atom.data_cleaning import Scaler, TransformerMixin
from atom.utils.types import (
- Backend, Bool, DataFrame, Engine, FeatureSelectionSolvers,
- FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero,
- FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators,
- Pandas, Scalar, Sequence, Series, Verbose, series_t,
+ Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats,
+ FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc,
+ IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence,
+ Verbose, XConstructor, XReturn, YConstructor,
)
from atom.utils.utils import (
- Goal, Task, bk, check_is_fitted, check_scaling, composed, crash,
- get_custom_scorer, is_sparse, lst, merge, method_to_log, sign,
+ Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse,
+ lst, merge, sign, to_df, to_tabular,
)
@@ -173,8 +172,7 @@ def __init__(
self.drop_columns = drop_columns
self.from_index = from_index
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Extract the new features.
Parameters
@@ -182,7 +180,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -191,24 +189,26 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed feature set.
"""
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+
self._log("Extracting datetime features...", 1)
if self.from_index:
- if hasattr(X.index, "to_timestamp"):
- Xc = bk.DataFrame(X.index.to_timestamp())
- order = Xc.columns.tolist() + X.columns.tolist()
+ if hasattr(Xt.index, "to_timestamp"):
+ Xc = pd.DataFrame(Xt.index.to_timestamp())
+ order = Xc.columns.tolist() + Xt.columns.tolist()
else:
raise ValueError("Unable to convert the index to a timestamp format.")
else:
- Xc = X.select_dtypes(exclude="number")
- order = X.columns.tolist()
+ Xc = Xt.select_dtypes(exclude="number")
+ order = Xt.columns.tolist()
- Xt = bk.DataFrame(index=X.index)
+ X_new = pd.DataFrame(index=Xt.index)
for name, column in Xc.items():
col_dt = pd.to_datetime(
arg=column,
errors="coerce", # Converts to NaT if he can't format
- format=self.fmt.get(name) if isinstance(self.fmt, dict) else self.fmt,
+ format=self.fmt.get(str(name)) if isinstance(self.fmt, dict) else self.fmt,
)
# If >30% of the values are NaT, the conversion was unsuccessful
@@ -228,7 +228,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
f"{fx.lower()} is not an attribute of pd.Series.dt."
)
- if not isinstance(series, series_t):
+ if not isinstance(series, pd.Series):
self._log(
f" --> Extracting feature {fx} "
"failed. Result is not a Series.dt.", 2,
@@ -238,7 +238,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
continue # Skip if the resulting feature has zero variance
min_val: int = 0
- max_val: Scalar | Series | None = None # None if isn't cyclic
+ max_val: Scalar | pd.Series | None = None # None if the feature isn't cyclic
if self.encoding_type == "cyclic":
if fx == "microsecond":
min_val, max_val = 0, 1e6 - 1
@@ -252,7 +252,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
min_val, max_val = 1, col_dt.dt.daysinmonth
elif fx in ("dayofyear", "day_of_year"):
min_val = 1
- max_val = [365 if i else 366 for i in col_dt.dt.is_leap_year]
+ max_val = pd.Series([366 if i else 365 for i in col_dt.dt.is_leap_year])  # 366 days in leap years
elif fx == "month":
min_val, max_val = 1, 12
elif fx == "quarter":
@@ -261,21 +261,21 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
new_name = f"{name}_{fx}"
if self.encoding_type == "ordinal" or max_val is None:
self._log(f" --> Creating feature {new_name}.", 2)
- Xt[new_name] = series.to_numpy()
- order.insert(order.index(name) + 1, new_name)
+ X_new[new_name] = series.to_numpy()
+ order.insert(order.index(str(name)) + 1, new_name)
elif self.encoding_type == "cyclic":
self._log(f" --> Creating cyclic feature {new_name}.", 2)
pos = 2 * np.pi * (series.to_numpy() - min_val) / np.array(max_val)
- Xt[f"{new_name}_sin"] = np.sin(pos)
- Xt[f"{new_name}_cos"] = np.cos(pos)
- order.insert(order.index(name) + 1, f"{new_name}_sin")
- order.insert(order.index(name) + 2, f"{new_name}_cos")
+ X_new[f"{new_name}_sin"] = np.sin(pos)
+ X_new[f"{new_name}_cos"] = np.cos(pos)
+ order.insert(order.index(str(name)) + 1, f"{new_name}_sin")
+ order.insert(order.index(str(name)) + 2, f"{new_name}_cos")
# Drop the original column
if self.drop_columns or self.from_index:
- order.remove(name)
+ order.remove(str(name))
- return merge(Xt, X)[order]
+ return self._convert(merge(X_new, Xt)[order])
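
The cyclic branch above maps a periodic feature onto the unit circle, so the two ends of the period (e.g., December and January) end up as neighbors instead of 11 units apart. A worked sketch for the month feature, with min_val=1 and max_val=12 as in the branch above:

    import numpy as np

    month = np.array([1, 6, 12])        # January, June, December
    pos = 2 * np.pi * (month - 1) / 12  # angle on the unit circle

    month_sin, month_cos = np.sin(pos), np.cos(pos)
    # January -> (0.0, 1.0) and December -> (-0.5, 0.87) sit 30 degrees
    # apart on the circle, while an ordinal encoding places them 11 apart.
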
@beartype
@@ -420,8 +420,7 @@ def __init__(
self.operators = operators
self.kwargs = kwargs
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -429,18 +428,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe-like: Target columns with shape=(n_samples,
- n_targets) for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -448,6 +437,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
Estimator instance.
"""
+ Xt = to_df(X)
+ yt = to_tabular(y, index=Xt.index)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
all_operators = {
"add": "add_numeric",
"sub": "subtract_numeric",
@@ -470,7 +465,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
if self.strategy == "dfs":
# Run deep feature synthesis with transformation primitives
- es = ft.EntitySet(dataframes={"X": (X, "_index", None, None, None, True)})
+ es = ft.EntitySet(dataframes={"X": (Xt, "_index", None, None, None, True)})
self._dfs = ft.dfs(
target_dataframe_name="X",
entityset=es,
@@ -481,7 +476,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
)
# Select the new features (dfs also returns originals)
- self._dfs = self._dfs[X.shape[1] - 1:]
+ self._dfs = self._dfs[Xt.shape[1] - 1:]
# Get a random selection of features
if self.n_features and self.n_features < len(self._dfs):
@@ -500,17 +495,16 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
init_depth=kwargs.pop("init_depth", (1, 2)),
const_range=kwargs.pop("const_range", None),
function_set=operators,
- feature_names=X.columns,
+ feature_names=Xt.columns,
verbose=kwargs.pop("verbose", 0 if self.verbose < 2 else 1),
n_jobs=kwargs.pop("n_jobs", self.n_jobs),
random_state=kwargs.pop("random_state", self.random_state),
**kwargs,
- ).fit(X, y)
+ ).fit(Xt, yt)
return self
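
A minimal usage sketch of the refactored fit, assuming the patched ATOM build; "add" and "sub" are two of the operator aliases from the all_operators mapping above:

    from sklearn.datasets import make_regression
    from atom.feature_engineering import FeatureGenerator

    X, y = make_regression(n_samples=100, n_features=5, random_state=1)

    fg = FeatureGenerator(strategy="dfs", n_features=10, operators=["add", "sub"])
    X_new = fg.fit_transform(X, y)  # original features plus 10 synthesized ones
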
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Generate new features.
Parameters
@@ -518,7 +512,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -527,18 +521,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed feature set.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Generating new features...", 1)
if self.strategy == "dfs":
- es = ft.EntitySet(dataframes={"X": (X, "index", None, None, None, True)})
- dfs = ft.calculate_feature_matrix(
- features=self._dfs,
- entityset=es,
- n_jobs=self.n_jobs,
- )
+ es = ft.EntitySet(dataframes={"X": (Xt, "index", None, None, None, True)})
+ dfs = ft.calculate_feature_matrix(self._dfs, entityset=es, n_jobs=self.n_jobs)
# Add the new features to the feature set
- X = pd.concat([X, dfs], axis=1).set_index("index")
+ Xt = pd.concat([Xt, dfs], axis=1).set_index("index")
self._log(f" --> {len(self._dfs)} new features were added.", 2)
@@ -548,7 +542,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
data=[
["", str(fx), fx.fitness_]
for i, fx in enumerate(self.gfg_)
- if str(fx) not in X.columns
+ if str(fx) not in Xt.columns
],
columns=["name", "description", "fitness"],
)
@@ -556,7 +550,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
# Check if any new features remain
if len(df) == 0:
self._log(" --> The genetic algorithm didn't find any improving features.", 2)
- return X
+ return Xt
# Select the n_features with the highest fitness
df = df.drop_duplicates()
@@ -566,17 +560,16 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
if len(df) != self.n_features:
self._log(
f" --> Dropping {(self.n_features or len(self.gfg_)) - len(df)} "
- "features due to repetition.",
- 2,
+ "features due to repetition.", 2,
)
- for i, array in enumerate(self.gfg_.transform(X)[:, df.index].T):
+ for i, array in enumerate(self.gfg_.transform(Xt)[:, df.index].T):
# If the column is new, use a default name
counter = 0
while True:
- name = f"x{X.shape[1] + counter}"
- if name not in X:
- X[name] = array # Add new feature to X
+ name = f"x{Xt.shape[1] + counter}"
+ if name not in Xt:
+ Xt[name] = array # Add new feature to Xt
df.iloc[i, 0] = name
break
else:
@@ -585,7 +578,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(f" --> {len(df)} new features were added.", 2)
self.genetic_features_ = df.reset_index(drop=True)
- return X
+ return self._convert(Xt)
@beartype
@@ -681,8 +674,7 @@ def __init__(
self.operators = operators
self.drop_columns = drop_columns
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Group features.
Parameters
@@ -690,7 +682,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -699,6 +691,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed feature set.
"""
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+
self._log("Grouping features...", 1)
if self.operators is None:
@@ -710,10 +704,10 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
for name, group in self.groups.items():
for operator in operators:
try:
- result = X[group].apply(getattr(np, operator), axis=1)
+ result = Xt[group].apply(getattr(np, operator), axis=1)
except AttributeError:
try:
- result = getattr(stats, operator)(X[group], axis=1)[0]
+ result = getattr(stats, operator)(Xt[group], axis=1)[0]
except AttributeError:
raise ValueError(
"Invalid value for the operators parameter. Value "
@@ -721,7 +715,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
) from None
try:
- X[f"{operator}({name})"] = result
+ Xt[f"{operator}({name})"] = result
except ValueError:
raise ValueError(
"Invalid value for the operators parameter. Value "
@@ -732,9 +726,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(f" --> Group {name} successfully created.", 2)
if self.drop_columns:
- X = X.drop(columns=to_drop)
+ Xt = Xt.drop(columns=to_drop)
- return X
+ return self._convert(Xt)
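
Operators are resolved by name, first against numpy and then against scipy.stats, which is what the nested try/except above implements. A standalone sketch of that dispatch (without the result-object indexing some scipy functions require):

    import numpy as np
    import pandas as pd
    from scipy import stats

    X = pd.DataFrame({"f1": [1.0, 2.0], "f2": [3.0, 4.0]})
    group = ["f1", "f2"]

    for operator in ("mean", "iqr"):
        try:
            result = X[group].apply(getattr(np, operator), axis=1)  # numpy first
        except AttributeError:
            result = getattr(stats, operator)(X[group], axis=1)     # scipy fallback
        X[f"{operator}(group)"] = result

    print(X.columns.tolist())  # ['f1', 'f2', 'mean(group)', 'iqr(group)']
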
@beartype
@@ -901,35 +895,12 @@ class FeatureSelector(TransformerMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
-
- - "data":
-
- - "pandas" (default)
- - "pyarrow"
- - "modin"
-
- - "estimator":
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "sklearn" (default)
- - "sklearnex"
- - "cuml"
-
- backend: str, default="loky"
- Parallelization backend. Read more in the
- [user guide][parallel-execution]. Choose from:
-
- - "loky": Single-node, process-based parallelism.
- - "multiprocessing": Legacy single-node, process-based
- parallelism. Less robust than `loky`.
- - "threading": Single-node, thread-based parallelism.
- - "ray": Multi-node, process-based parallelism.
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -1015,7 +986,6 @@ def __init__(
n_jobs: NJobs = 1,
device: str = "cpu",
engine: Engine = None,
- backend: Backend = "loky",
verbose: Verbose = 0,
random_state: IntLargerEqualZero | None = None,
**kwargs,
@@ -1024,7 +994,6 @@ def __init__(
n_jobs=n_jobs,
device=device,
engine=engine,
- backend=backend,
verbose=verbose,
random_state=random_state,
)
@@ -1036,8 +1005,7 @@ def __init__(
self.max_correlation = max_correlation
self.kwargs = kwargs
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit the feature selector to the data.
The univariate, sfm (when model is not fitted), sfs, rfe and
@@ -1049,18 +1017,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If dict: Name of the target column and sequence of values.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput
- tasks.
- - If dataframe-like: Target columns with shape=(n_samples,
- n_targets) for multioutput tasks.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
Returns
-------
@@ -1070,14 +1028,6 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
"""
from atom.models import MODELS
- def check_y():
- """For some strategies, y needs to be provided."""
- if y is None:
- raise ValueError(
- "Invalid value for the y parameter. Value cannot "
- f"be None for strategy='{self.strategy}'."
- )
-
def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
"""Objective function for the advanced optimization strategies."""
if X_train.equals(X_valid):
@@ -1087,6 +1037,18 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
model.fit(X_train, y_train)
return scoring(model, X_valid, y_valid)
+ Xt = to_df(X)
+ yt = to_tabular(y, index=Xt.index)
+
+ if yt is None and self.strategy not in ("pca", "sfm", None):
+ raise ValueError(
+ "Invalid value for the y parameter. Value cannot "
+ f"be None for strategy='{self.strategy}'."
+ )
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
+
self.collinear_ = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"])
self.scaler_ = None
@@ -1094,21 +1056,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
self._high_variance: dict[Hashable, tuple[Hashable, int]] = {}
self._low_variance: dict[Hashable, tuple[Hashable, float]] = {}
self._estimator: Any = None
- self._n_features = None
-
- strategies = {
- "univariate": "SelectKBest",
- "pca": "PCA",
- "sfm": "SelectFromModel",
- "sfs": "SequentialFeatureSelector",
- "rfe": "RFE",
- "rfecv": "RFECV",
- "pso": ParticleSwarmOptimization,
- "hho": HarrisHawkOptimization,
- "gwo": GreyWolfOptimization,
- "dfo": DragonFlyOptimization,
- "go": GeneticOptimization,
- }
+ self._n_features: int | None = None
if isinstance(self.strategy, str):
if self.strategy not in ("univariate", "pca"):
@@ -1144,7 +1092,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
if hasattr(self, x)
},
)
- model.task = goal.infer_task(y)
+ if yt is not None:
+ model.task = goal.infer_task(yt)
solver = model._get_est({})
else:
raise ValueError(
@@ -1171,25 +1120,25 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
)
if self.n_features is None:
- self._n_features = X.shape[1]
+ self._n_features = Xt.shape[1]
elif self.n_features < 1:
- self._n_features = int(self.n_features * X.shape[1])
+ self._n_features = int(self.n_features * Xt.shape[1])
else:
- self._n_features = self.n_features
+ self._n_features = int(self.n_features)
min_repeated: Scalar
if self.min_repeated is None:
min_repeated = 1
elif self.min_repeated <= 1:
- min_repeated = self.min_repeated * len(X)
+ min_repeated = self.min_repeated * len(Xt)
else:
min_repeated = int(self.min_repeated)
max_repeated: Scalar
if self.max_repeated is None:
- max_repeated = len(X)
+ max_repeated = len(Xt)
elif self.max_repeated <= 1:
- max_repeated = self.max_repeated * len(X)
+ max_repeated = self.max_repeated * len(Xt)
else:
max_repeated = int(self.max_repeated)
@@ -1203,30 +1152,30 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
# Remove features with too high variance
if self.min_repeated is not None:
- for name, column in X.select_dtypes(exclude="number").items():
+ for name, column in Xt.select_dtypes(exclude="number").items():
max_counts = column.value_counts()
if min_repeated > max_counts.max():
self._high_variance[name] = (max_counts.idxmax(), max_counts.max())
- X = X.drop(columns=name)
+ Xt = Xt.drop(columns=name)
break
# Remove features with too low variance
if self.max_repeated is not None:
- for name, column in X.select_dtypes(exclude="number").items():
+ for name, column in Xt.select_dtypes(exclude="number").items():
for category, count in column.value_counts().items():
if count >= max_repeated:
- self._low_variance[name] = (category, 100.0 * count / len(X))
- X = X.drop(columns=name)
+ self._low_variance[name] = (category, 100.0 * count / len(Xt))
+ Xt = Xt.drop(columns=name)
break
# Remove features with too high correlation
self.collinear = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"])
if self.max_correlation:
# Get the Pearson correlation coefficient matrix
- if y is None:
- corr_X = X.corr()
+ if yt is None:
+ corr_X = Xt.corr()
else:
- corr_matrix = merge(X, y).corr()
+ corr_matrix = merge(Xt, yt).corr()
corr_X, corr_y = corr_matrix.iloc[:-1, :-1], corr_matrix.iloc[:-1, -1]
corr = {}
@@ -1237,7 +1186,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
# A column always correlates perfectly with itself
if len(corr[col]) > 1:
- if y is None:
+ if yt is None:
# Drop all but the first one
to_drop.extend(list(corr[col][1:].index))
else:
@@ -1262,7 +1211,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
ignore_index=True,
)
- X = X.drop(columns=self.collinear_["drop"].tolist())
+ Xt = Xt.drop(columns=self.collinear_["drop"].tolist())
if self.strategy is None:
return self # Exit feature_engineering
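
The collinearity filter above keeps one representative per group of highly correlated features (the one most correlated with the target when y is available). A standalone sketch of the unsupervised branch:

    import pandas as pd

    X = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2, 4, 6, 8, 10],  # perfectly correlated with "a"
        "c": [5, 3, 8, 1, 9],
    })
    max_correlation = 0.99

    corr_X = X.corr()
    to_drop: set[str] = set()
    for col in corr_X.columns:
        high = corr_X[col][corr_X[col] >= max_correlation].index.drop(col)
        if col not in to_drop:
            to_drop |= set(high)  # keep the first column of each group

    print(X.drop(columns=list(to_drop)).columns.tolist())  # ['a', 'c']
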
@@ -1292,15 +1241,14 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
else:
solver = self.solver
- check_y()
- self._estimator = SelectKBest(solver, k=self._n_features).fit(X, y)
+ self._estimator = SelectKBest(solver, k=self._n_features).fit(Xt, yt)
elif self.strategy == "pca":
- if not is_sparse(X):
+ if not is_sparse(Xt):
# PCA requires the features to be scaled
- if not check_scaling(X):
- self.scaler_ = Scaler()
- X = self.scaler_.fit_transform(X)
+ if not check_scaling(Xt):
+ self.scaler_ = Scaler(device=self.device, engine=self.engine)
+ Xt = cast(pd.DataFrame, self.scaler_.fit_transform(Xt))
estimator = self._get_est_class("PCA", "decomposition")
solver_param = "svd_solver"
@@ -1316,11 +1264,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
# The PCA and TruncatedSVD both get all possible components to use
# for the plots (n_components must be < n_features and <= n_rows)
self._estimator = estimator(
- n_components=min(len(X), X.shape[1] - 1),
+ n_components=min(len(Xt), Xt.shape[1] - 1),
**{solver_param: solver},
random_state=self.random_state,
**self.kwargs,
- ).fit(X)
+ ).fit(Xt)
self._estimator._comps = min(self._estimator.components_.shape[0], self._n_features)
@@ -1342,7 +1290,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
**kwargs,
)
if prefit:
- if list(getattr(solver, "feature_names_in_", [])) != list(X.columns):
+ if list(getattr(solver, "feature_names_in_", [])) != list(Xt.columns):
raise ValueError(
"Invalid value for the solver parameter. The "
f"{solver.__class__.__name__} estimator "
@@ -1350,13 +1298,10 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
)
self._estimator.estimator_ = solver
else:
- check_y()
- self._estimator.fit(X, y)
+ self._estimator.fit(Xt, yt)
elif self.strategy in ("sfs", "rfe", "rfecv"):
if self.strategy == "sfs":
- check_y()
-
if self.kwargs.get("scoring"):
kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"])
@@ -1368,8 +1313,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
)
elif self.strategy == "rfe":
- check_y()
-
self._estimator = RFE(
estimator=solver,
n_features_to_select=self._n_features,
@@ -1377,13 +1320,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
)
elif self.strategy == "rfecv":
- check_y()
-
if self.kwargs.get("scoring"):
kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"])
# Invert n_features to select them all (default option)
- if self._n_features == X.shape[1]:
+ if self._n_features == Xt.shape[1]:
self._n_features = 1
self._estimator = RFECV(
@@ -1393,11 +1334,16 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
**kwargs,
)
- with joblib.parallel_backend(backend=self.backend):
- self._estimator.fit(X, y)
+ self._estimator.fit(Xt, yt)
else:
- check_y()
+ strategies = {
+ "pso": ParticleSwarmOptimization,
+ "hho": HarrisHawkOptimization,
+ "gwo": GreyWolfOptimization,
+ "dfo": DragonFlyOptimization,
+ "go": GeneticOptimization,
+ }
# Either use a provided validation set or cross-validation over X
if "X_valid" in kwargs:
@@ -1411,7 +1357,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
"cannot be absent when X_valid is provided."
)
else:
- X_valid, y_valid = X, y
+ X_valid, y_valid = Xt, yt
# Get scoring for default objective_function
if "objective_function" not in kwargs:
@@ -1419,7 +1365,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
kwargs["scoring"] = get_custom_scorer(kwargs["scoring"])
else:
goal = Goal(0) if is_classifier(solver) else Goal(1)
- task = goal.infer_task(y)
+ if yt is not None:
+ task = goal.infer_task(yt)
if task is Task.binary_classification:
kwargs["scoring"] = get_custom_scorer("f1")
elif task.is_multiclass:
@@ -1435,8 +1382,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring):
self._estimator.fit(
model=solver,
- X_train=X,
- y_train=y,
+ X_train=Xt,
+ y_train=yt,
X_valid=X_valid,
y_valid=y_valid,
verbose=self.verbose >= 2,
@@ -1491,8 +1438,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) ->
]
)
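
A usage sketch of the refactored fit/transform pair, assuming the patched ATOM build and the usual string solvers for the univariate strategy (e.g., "f_classif"); per the check in fit, y may only be omitted for strategy "pca", "sfm" or None:

    from sklearn.datasets import make_classification
    from atom.feature_engineering import FeatureSelector

    X, y = make_classification(n_samples=100, n_features=20, random_state=1)

    selector = FeatureSelector(strategy="univariate", solver="f_classif", n_features=5)
    selector.fit(X, y)             # raises if y is None for this strategy
    X_new = selector.transform(X)  # keeps the 5 best-scoring features
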
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Transform the data.
Parameters
@@ -1500,7 +1446,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
X: dataframe-like
Feature set with shape=(n_samples, n_features).
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1509,6 +1455,10 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed feature set.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Performing feature selection ...", 1)
# Remove features with too high variance
@@ -1516,9 +1466,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
self._log(
f" --> Feature {fx} was removed due to high variance. "
f"Value {h_variance[0]} was the most repeated value with "
- f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.", 2,
+ f"{h_variance[1]} ({h_variance[1] / len(Xt):.1f}%) occurrences.", 2,
)
- X = X.drop(columns=fx)
+ Xt = Xt.drop(columns=fx)
# Remove features with too low variance
for fx, l_variance in self._low_variance.items():
@@ -1526,7 +1476,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
f" --> Feature {fx} was removed due to low variance. Value "
f"{l_variance[0]} repeated in {l_variance[1]:.1f}% of the rows.", 2,
)
- X = X.drop(columns=fx)
+ Xt = Xt.drop(columns=fx)
# Remove features with too high correlation
for col in self.collinear_["drop"]:
@@ -1534,34 +1484,34 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
f" --> Feature {col} was removed due to "
"collinearity with another feature.", 2,
)
- X = X.drop(columns=col)
+ Xt = Xt.drop(columns=col)
# Perform selection based on strategy
if self.strategy is None:
- return X
+ return self._convert(Xt)
elif self.strategy == "univariate":
self._log(
f" --> The univariate test selected "
f"{self._n_features} features from the dataset.", 2,
)
- for n, column in enumerate(X):
+ for n, column in enumerate(Xt):
if not self.univariate_.get_support()[n]:
self._log(
f" --> Dropping feature {column} "
f"(score: {self.univariate_.scores_[n]:.2f} "
f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2,
)
- X = X.drop(columns=column)
+ Xt = Xt.drop(columns=column)
elif self.strategy == "pca":
self._log(" --> Applying Principal Component Analysis...", 2)
if self.scaler_:
self._log(" --> Scaling features...", 2)
- X = self.scaler_.transform(X)
+ Xt = cast(pd.DataFrame, self.scaler_.transform(Xt))
- X = self._estimator.transform(X).iloc[:, :self._estimator._comps]
+ Xt = self._estimator.transform(Xt).iloc[:, :self._estimator._comps]
var = np.array(self._estimator.explained_variance_ratio_[:self._n_features])
self._log(f" --> Keeping {self._estimator._comps} components.", 2)
@@ -1571,7 +1521,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
mask = self._estimator.get_support()
self._log(f" --> {self.strategy} selected {sum(mask)} features from the dataset.", 2)
- for n, column in enumerate(X):
+ for n, column in enumerate(Xt):
if not mask[n]:
if hasattr(self._estimator, "ranking_"):
self._log(
@@ -1580,7 +1530,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
)
else:
self._log(f" --> Dropping feature {column}.", 2)
- X = X.drop(columns=column)
+ Xt = Xt.drop(columns=column)
else: # Advanced strategies
self._log(
@@ -1588,9 +1538,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
"features from the dataset.", 2,
)
- for column in X:
+ for column in Xt:
if column not in self._estimator.best_feature_list:
self._log(f" --> Dropping feature {column}.", 2)
- X = X.drop(columns=column)
+ Xt = Xt.drop(columns=column)
- return X
+ return self._convert(Xt)
diff --git a/atom/models/classreg.py b/atom/models/classreg.py
index 9dfac9c04..02dffde14 100644
--- a/atom/models/classreg.py
+++ b/atom/models/classreg.py
@@ -7,6 +7,7 @@
from __future__ import annotations
+from collections.abc import Mapping
from typing import Any, ClassVar, cast
import numpy as np
@@ -22,7 +23,7 @@
from optuna.trial import Trial
from atom.basemodel import BaseModel
-from atom.utils.types import DataFrame, Pandas, Predictor
+from atom.utils.types import Pandas, Predictor
from atom.utils.utils import CatBMetric, Goal, LGBMetric, XGBMetric
@@ -76,7 +77,7 @@ class AdaBoost(BaseModel):
"regression": "sklearn.ensemble.AdaBoostRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
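
Switching the return annotation from dict to Mapping is presumably a variance fix: dict is invariant in its value type, so an override returning a dict of a more specific distribution type would not type-check, while Mapping (read-only, hence covariant in its values) accepts it. A small sketch:

    from collections.abc import Mapping

    from optuna.distributions import (
        BaseDistribution, FloatDistribution, IntDistribution,
    )

    def _get_distributions() -> Mapping[str, BaseDistribution]:
        # dict[str, IntDistribution] is not a dict[str, BaseDistribution]
        # (dict is invariant), but it is a Mapping[str, BaseDistribution].
        return {
            "n_estimators": IntDistribution(10, 500),
            "learning_rate": FloatDistribution(0.01, 1.0, log=True),
        }
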
@@ -485,8 +486,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
def _fit_estimator(
self,
estimator: Predictor,
- data: tuple[DataFrame, Pandas],
- validation: tuple[DataFrame, Pandas] | None = None,
+ data: tuple[pd.DataFrame, Pandas],
+ validation: tuple[pd.DataFrame, Pandas] | None = None,
trial: Trial | None = None,
):
"""Fit the estimator and perform in-training validation.
@@ -734,7 +735,7 @@ class DecisionTree(BaseModel):
"regression": "sklearn.tree.DecisionTreeRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -809,7 +810,7 @@ class Dummy(BaseModel):
"regression": "sklearn.dummy.DummyRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -943,7 +944,7 @@ class ExtraTree(BaseModel):
"regression": "sklearn.tree.ExtraTreeRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1038,7 +1039,7 @@ def _get_parameters(self, trial: Trial) -> dict:
return params
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1229,7 +1230,7 @@ class GradientBoostingMachine(BaseModel):
"regression": "sklearn.ensemble.GradientBoostingRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1376,7 +1377,7 @@ class HistGradientBoosting(BaseModel):
"regression": "sklearn.ensemble.HistGradientBoostingRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1452,7 +1453,7 @@ class KNearestNeighbors(BaseModel):
"regression": "sklearn.neighbors.KNeighborsRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1675,8 +1676,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
def _fit_estimator(
self,
estimator: Predictor,
- data: tuple[DataFrame, Pandas],
- validation: tuple[DataFrame, Pandas] | None = None,
+ data: tuple[pd.DataFrame, Pandas],
+ validation: tuple[pd.DataFrame, Pandas] | None = None,
trial: Trial | None = None,
):
"""Fit the estimator and perform in-training validation.
@@ -1951,7 +1952,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
else:
return super()._get_est(params)
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -2053,7 +2054,7 @@ def _get_parameters(self, trial: Trial) -> dict:
return params
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -2161,7 +2162,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]:
return params
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -2400,7 +2401,7 @@ class PassiveAggressive(BaseModel):
"regression": "sklearn.linear_model.PassiveAggressiveRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -2712,7 +2713,7 @@ def _get_parameters(self, trial: Trial) -> dict:
return params
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -2805,7 +2806,7 @@ class Ridge(BaseModel):
"regression": "sklearn.linear_model.Ridge",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -2878,7 +2879,7 @@ class StochasticGradientDescent(BaseModel):
"regression": "sklearn.linear_model.SGDRegressor",
}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -3003,7 +3004,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
else:
return super()._get_est(params)
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -3132,8 +3133,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
def _fit_estimator(
self,
estimator: Predictor,
- data: tuple[DataFrame, Pandas],
- validation: tuple[DataFrame, Pandas] | None = None,
+ data: tuple[pd.DataFrame, Pandas],
+ validation: tuple[pd.DataFrame, Pandas] | None = None,
trial: Trial | None = None,
):
"""Fit the estimator and perform in-training validation.
diff --git a/atom/models/custom.py b/atom/models/custom.py
index 6c9e49495..ae10fe3be 100644
--- a/atom/models/custom.py
+++ b/atom/models/custom.py
@@ -5,7 +5,6 @@
"""
-from functools import cached_property
from typing import Any
from atom.basemodel import BaseModel
@@ -56,7 +55,7 @@ def fullname(self) -> str:
"""Return the estimator's class name."""
return self._est_class.__name__
- @cached_property
+ @property
def _est_class(self) -> type[Predictor]:
"""Return the estimator's class."""
return self._est
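
Dropping the cache means the class lookup now follows any later change to self._est instead of pinning the first value seen; presumably this also sidesteps stale caches after cloning or pickling. A standalone illustration of the difference:

    from functools import cached_property

    class Cached:
        def __init__(self, est):
            self._est = est

        @cached_property
        def est_class(self):  # first result is stored on the instance
            return self._est

    class Uncached:
        def __init__(self, est):
            self._est = est

        @property
        def est_class(self):  # re-evaluated on every access
            return self._est

    c, u = Cached(int), Uncached(int)
    print(c.est_class, u.est_class)  # <class 'int'> <class 'int'>
    c._est = u._est = float
    print(c.est_class, u.est_class)  # <class 'int'> <class 'float'> (stale cache)
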
diff --git a/atom/models/ts.py b/atom/models/ts.py
index 59f078f1b..f067eb5d9 100644
--- a/atom/models/ts.py
+++ b/atom/models/ts.py
@@ -7,6 +7,7 @@
from __future__ import annotations
+from collections.abc import Mapping
from logging import ERROR, WARNING, getLogger
from typing import Any, ClassVar
@@ -161,7 +162,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
"""
return super()._get_est({"suppress_warnings": self.warnings == "ignore"} | params)
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -835,7 +836,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]:
return {"stl_kwargs": self._est_params.get("stl_kwargs", {}) | params}
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1207,7 +1208,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]:
return params
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
@@ -1652,7 +1653,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
"""
return super()._get_est({"suppress_warnings": self.warnings == "ignore"} | params)
- def _get_distributions(self) -> dict[str, BaseDistribution]:
+ def _get_distributions(self) -> Mapping[str, BaseDistribution]:
"""Get the predefined hyperparameter distributions.
Returns
diff --git a/atom/nlp.py b/atom/nlp.py
index 3e05e57fb..392124eb9 100644
--- a/atom/nlp.py
+++ b/atom/nlp.py
@@ -10,35 +10,31 @@
import re
import unicodedata
from string import punctuation
+from typing import TYPE_CHECKING
-import nltk
import numpy as np
import pandas as pd
from beartype import beartype
-from nltk.collocations import (
- BigramCollocationFinder, QuadgramCollocationFinder,
- TrigramCollocationFinder,
-)
-from nltk.corpus import wordnet
-from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.base import OneToOneFeatureMixin
-from sklearn.utils._set_output import _SetOutputMixin
from sklearn.utils.validation import _check_feature_names_in
from typing_extensions import Self
from atom.data_cleaning import TransformerMixin
from atom.utils.types import (
- Bool, DataFrame, Engine, FloatLargerZero, Pandas, Sequence,
- VectorizerStarts, Verbose, bool_t,
+ Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose,
+ XConstructor, XReturn, YConstructor, bool_t,
)
from atom.utils.utils import (
- check_is_fitted, check_nltk_module, composed, crash, get_corpus, is_sparse,
- merge, method_to_log, to_df,
+ check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df,
)
+if TYPE_CHECKING:
+ from nltk.corpus import wordnet
+
+
@beartype
-class TextCleaner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class TextCleaner(TransformerMixin, OneToOneFeatureMixin):
r"""Applies standard text cleaning to the corpus.
Transformations include normalizing characters and dropping
@@ -193,8 +189,7 @@ def __init__(
self.regex_number = regex_number
self.drop_punctuation = drop_punctuation
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Apply the transformations to the data.
Parameters
@@ -204,7 +199,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
not a dataframe, it should be composed of a single feature
containing the text documents.
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -245,28 +240,29 @@ def drop_regex(regex: str):
Regex pattern to replace.
"""
- if isinstance(X[corpus].iloc[0], str):
- X[corpus] = X[corpus].str.replace(regex, "", regex=True)
+ if isinstance(Xt[corpus].iloc[0], str):
+ Xt[corpus] = Xt[corpus].str.replace(regex, "", regex=True)
else:
- X[corpus] = X[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x])
+ Xt[corpus] = Xt[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x])
- corpus = get_corpus(X)
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+ corpus = get_corpus(Xt)
self._log("Cleaning the corpus...", 1)
if self.decode:
- if isinstance(X[corpus].iloc[0], str):
- X[corpus] = X[corpus].apply(lambda x: to_ascii(x))
+ if isinstance(Xt[corpus].iloc[0], str):
+ Xt[corpus] = Xt[corpus].apply(lambda x: to_ascii(x))
else:
- X[corpus] = X[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc])
+ Xt[corpus] = Xt[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc])
self._log(" --> Decoding unicode characters to ascii.", 2)
if self.lower_case:
self._log(" --> Converting text to lower case.", 2)
- if isinstance(X[corpus].iloc[0], str):
- X[corpus] = X[corpus].str.lower()
+ if isinstance(Xt[corpus].iloc[0], str):
+ Xt[corpus] = Xt[corpus].str.lower()
else:
- X[corpus] = X[corpus].apply(lambda doc: [str(w).lower() for w in doc])
+ Xt[corpus] = Xt[corpus].apply(lambda doc: [str(w).lower() for w in doc])
if self.drop_email:
if not self.regex_email:
@@ -306,21 +302,21 @@ def drop_regex(regex: str):
if self.drop_punctuation:
self._log(" --> Dropping punctuation from the text.", 2)
trans_table = str.maketrans("", "", punctuation) # Translation table
- if isinstance(X[corpus].iloc[0], str):
+ if isinstance(Xt[corpus].iloc[0], str):
func = lambda doc: doc.translate(trans_table)
else:
func = lambda doc: [str(w).translate(trans_table) for w in doc]
- X[corpus] = X[corpus].apply(func)
+ Xt[corpus] = Xt[corpus].apply(func)
# Drop empty tokens from every document
- if not isinstance(X[corpus].iloc[0], str):
- X[corpus] = X[corpus].apply(lambda doc: [w for w in doc if w])
+ if not isinstance(Xt[corpus].iloc[0], str):
+ Xt[corpus] = Xt[corpus].apply(lambda doc: [w for w in doc if w])
- return X
+ return self._convert(Xt)
@beartype
-class TextNormalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class TextNormalizer(TransformerMixin, OneToOneFeatureMixin):
"""Normalize the corpus.
Convert words to a more uniform standard. The transformations
@@ -444,8 +440,7 @@ def __init__(
self.stem = stem
self.lemmatize = lemmatize
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Normalize the text.
Parameters
@@ -455,7 +450,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
not a dataframe, it should be composed of a single feature
containing the text documents.
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -488,31 +483,36 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN:
else: # "NN", "NNS", "NNP", "NNPS"
return wordnet.NOUN
- corpus = get_corpus(X)
+ from nltk import pos_tag
+ from nltk.corpus import stopwords, wordnet
+ from nltk.stem import SnowballStemmer, WordNetLemmatizer
+
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+ corpus = get_corpus(Xt)
self._log("Normalizing the corpus...", 1)
# If the corpus is not tokenized, separate by space
- if isinstance(X[corpus].iloc[0], str):
- X[corpus] = X[corpus].apply(lambda row: row.split())
+ if isinstance(Xt[corpus].iloc[0], str):
+ Xt[corpus] = Xt[corpus].apply(lambda row: row.split())
- stopwords = set()
+ stop_words = set()
if self.stopwords:
if isinstance(self.stopwords, bool_t):
self.stopwords = "english"
# Get stopwords from the NLTK library
check_nltk_module("corpora/stopwords", quiet=self.verbose < 2)
- stopwords = set(nltk.corpus.stopwords.words(self.stopwords.lower()))
+ stop_words = set(stopwords.words(self.stopwords.lower()))
# Join predefined with custom stopwords
if self.custom_stopwords is not None:
- stopwords = stopwords | set(self.custom_stopwords)
+ stop_words = stop_words | set(self.custom_stopwords)
- if stopwords:
+ if stop_words:
self._log(" --> Dropping stopwords.", 2)
- f = lambda row: [word for word in row if word not in stopwords]
- X[corpus] = X[corpus].apply(f)
+ f = lambda row: [word for word in row if word not in stop_words]
+ Xt[corpus] = Xt[corpus].apply(f)
if self.stem:
if isinstance(self.stem, bool_t):
@@ -520,7 +520,7 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN:
self._log(" --> Applying stemming.", 2)
ss = SnowballStemmer(language=self.stem.lower())
- X[corpus] = X[corpus].apply(lambda row: [ss.stem(word) for word in row])
+ Xt[corpus] = Xt[corpus].apply(lambda row: [ss.stem(word) for word in row])
if self.lemmatize:
self._log(" --> Applying lemmatization.", 2)
@@ -529,14 +529,14 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN:
check_nltk_module("corpora/omw-1.4", quiet=self.verbose < 2)
wnl = WordNetLemmatizer()
- f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in nltk.pos_tag(row)]
- X[corpus] = X[corpus].apply(f)
+ f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in pos_tag(row)]
+ Xt[corpus] = Xt[corpus].apply(f)
- return X
+ return self._convert(Xt)
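
For illustration, a minimal sketch of what the stemming and lemmatization
steps above do to a tokenized document (assumes the required NLTK data
packages, e.g. wordnet, are already downloaded):

    from nltk.stem import SnowballStemmer, WordNetLemmatizer

    tokens = ["studies", "are", "running", "better"]

    ss = SnowballStemmer(language="english")
    print([ss.stem(w) for w in tokens])        # ['studi', 'are', 'run', 'better']

    wnl = WordNetLemmatizer()                  # defaults to noun POS
    print([wnl.lemmatize(w) for w in tokens])  # ['study', 'are', 'running', 'better']
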
@beartype
-class Tokenizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin):
+class Tokenizer(TransformerMixin, OneToOneFeatureMixin):
"""Tokenize the corpus.
Convert documents into sequences of words. Additionally,
@@ -664,8 +664,7 @@ def __init__(
self.trigram_freq = trigram_freq
self.quadgram_freq = quadgram_freq
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Tokenize the text.
Parameters
@@ -675,7 +674,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
not a dataframe, it should be composed of a single feature
containing the text documents.
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -712,24 +711,28 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]:
return row_c[2:-2].split(sep)
- corpus = get_corpus(X)
+ import nltk.collocations as collocations
+ from nltk import word_tokenize
+
+ Xt = to_df(X, columns=getattr(self, "feature_names_in_", None))
+ corpus = get_corpus(Xt)
self._log("Tokenizing the corpus...", 1)
- if isinstance(X[corpus].iloc[0], str):
+ if isinstance(Xt[corpus].iloc[0], str):
check_nltk_module("tokenizers/punkt", quiet=self.verbose < 2)
- X[corpus] = X[corpus].apply(lambda row: nltk.word_tokenize(row))
+ Xt[corpus] = Xt[corpus].apply(lambda row: word_tokenize(row))
ngrams = {
- "bigrams": BigramCollocationFinder,
- "trigrams": TrigramCollocationFinder,
- "quadgrams": QuadgramCollocationFinder,
+ "bigrams": collocations.BigramCollocationFinder,
+ "trigrams": collocations.TrigramCollocationFinder,
+ "quadgrams": collocations.QuadgramCollocationFinder,
}
for attr, finder in ngrams.items():
if frequency := getattr(self, f"{attr[:-1]}_freq"):
# Search for all n-grams in the corpus
- ngram_fd = finder.from_documents(X[corpus]).ngram_fd
+ ngram_fd = finder.from_documents(Xt[corpus]).ngram_fd
if frequency < 1:
frequency = int(frequency * len(ngram_fd))
@@ -740,7 +743,7 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]:
if freq >= frequency:
occur += 1
counts += freq
- X[corpus] = X[corpus].apply(replace_ngrams, args=(ngram,))
+ Xt[corpus] = Xt[corpus].apply(replace_ngrams, args=(ngram,))
rows.append({attr[:-1]: "_".join(ngram), "frequency": freq})
if rows:
@@ -752,11 +755,11 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]:
else:
self._log(f" --> No {attr} found in the corpus.", 2)
- return X
+ return self._convert(Xt)
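
A small sketch of the n-gram detection used above: the collocation finder
counts adjacent token pairs across documents, and pairs that meet the
frequency threshold are merged into single "word_word" tokens.

    from nltk.collocations import BigramCollocationFinder

    docs = [["new", "york", "is", "big"], ["i", "love", "new", "york"]]
    finder = BigramCollocationFinder.from_documents(docs)
    for ngram, freq in finder.ngram_fd.items():  # FreqDist of bigram counts
        if freq >= 2:
            print("_".join(ngram), freq)         # new_york 2
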
@beartype
-class Vectorizer(TransformerMixin, _SetOutputMixin):
+class Vectorizer(TransformerMixin):
"""Vectorize text data.
Transform the corpus into meaningful vectors of numbers. The
@@ -792,24 +795,12 @@ class Vectorizer(TransformerMixin, _SetOutputMixin):
`#!python device="gpu"` to use the GPU. Read more in the
[user guide][gpu-acceleration].
- engine: str, dict or None, default=None
- Execution engine to use for [data][data-acceleration] and
- [estimators][estimator-acceleration]. The value should be
- one of the possible values to change one of the two engines,
- or a dictionary with keys `data` and `estimator`, with their
- corresponding choice as values to change both engines. If
- None, the default values are used. Choose from:
-
- - "data":
+ engine: str or None, default=None
+ Execution engine to use for [estimators][estimator-acceleration].
+ If None, the default value is used. Choose from:
- - "pandas" (default)
- - "pyarrow"
- - "modin"
-
- - "estimator":
-
- - "sklearn" (default)
- - "cuml"
+ - "sklearn" (default)
+ - "cuml"
verbose: int, default=0
Verbosity level of the class. Choose from:
@@ -923,8 +914,7 @@ def _get_corpus_columns(self) -> list[str]:
"The get_feature_names_out method is not available for strategy='hashing'."
)
- @composed(crash, method_to_log)
- def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
+ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self:
"""Fit to data.
Parameters
@@ -934,7 +924,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
not a dataframe, it should be composed of a single feature
containing the text documents.
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -943,11 +933,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
Estimator instance.
"""
- self._corpus = get_corpus(X)
+ Xt = to_df(X)
+ self._corpus = get_corpus(Xt)
+
+ self._check_feature_names(Xt, reset=True)
+ self._check_n_features(Xt, reset=True)
# Convert a sequence of tokens to space-separated string
- if not isinstance(X[self._corpus].iloc[0], str):
- X[self._corpus] = X[self._corpus].apply(lambda row: " ".join(row))
+ if not isinstance(Xt[self._corpus].iloc[0], str):
+ Xt[self._corpus] = Xt[self._corpus].apply(lambda row: " ".join(row))
strategies = {
"bow": "CountVectorizer",
@@ -966,7 +960,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
self._estimator.set_output(transform="default")
self._log("Fitting Vectorizer...", 1)
- self._estimator.fit(X[self._corpus])
+ self._estimator.fit(Xt[self._corpus])
# Add the estimator as an attribute to the instance
setattr(self, f"{self.strategy}_", self._estimator)
@@ -994,8 +988,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) ->
og_columns = [c for c in self.feature_names_in_ if c != self._corpus]
return np.array(og_columns + self._get_corpus_columns())
- @composed(crash, method_to_log)
- def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
+ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn:
"""Vectorize the text.
Parameters
@@ -1005,7 +998,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
not a dataframe, it should be composed of a single feature
containing the text documents.
- y: int, str, sequence, dataframe-like or None, default=None
+ y: sequence, dataframe-like or None, default=None
Do nothing. Implemented for continuity of the API.
Returns
@@ -1014,14 +1007,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
Transformed corpus.
"""
+ check_is_fitted(self)
+
+ Xt = to_df(X, columns=self.feature_names_in_)
+
self._log("Vectorizing the corpus...", 1)
# Convert a sequence of tokens to space-separated string
- if not isinstance(X[self._corpus].iloc[0], str):
- X[self._corpus] = X[self._corpus].apply(lambda row: " ".join(row))
+ if not isinstance(Xt[self._corpus].iloc[0], str):
+ Xt[self._corpus] = Xt[self._corpus].apply(lambda row: " ".join(row))
- matrix = self._estimator.transform(X[self._corpus])
- X = X.drop(columns=self._corpus) # Drop original corpus column
+ matrix = self._estimator.transform(Xt[self._corpus])
+ Xt = Xt.drop(columns=self._corpus) # Drop original corpus column
if "sklearn" not in self._estimator.__class__.__module__:
matrix = matrix.get() # Convert cupy sparse array back to scipy
@@ -1029,7 +1026,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
if not self.return_sparse:
self._log(" --> Converting the output to a full array.", 2)
matrix = matrix.toarray()
- elif not X.empty and not is_sparse(X):
+ elif not Xt.empty and not is_sparse(Xt):
# Raise if there are other columns that are non-sparse
raise ValueError(
"Invalid value for the return_sparse parameter. The value must "
@@ -1042,4 +1039,4 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
# Hashing has no words to put as column names
columns = [f"hash{i}" for i in range(1, matrix.shape[1] + 1)]
- return merge(X, to_df(matrix, index=X.index, columns=columns))
+ return self._convert(merge(Xt, to_df(matrix, index=Xt.index, columns=columns)))
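
A hedged usage sketch for the refactored Vectorizer (import path assumed;
the corpus column is replaced by one numeric column per vocabulary word):

    import pandas as pd
    from atom.nlp import Vectorizer  # assumed import path

    X = pd.DataFrame({"corpus": ["the cat sat", "the dog ran"]})
    vec = Vectorizer(strategy="bow", return_sparse=False)
    print(vec.fit_transform(X))  # dense bag-of-words dataframe
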
diff --git a/atom/pipeline.py b/atom/pipeline.py
index e09c3578f..d4d57b391 100644
--- a/atom/pipeline.py
+++ b/atom/pipeline.py
@@ -9,9 +9,10 @@
from collections.abc import Iterator
from itertools import islice
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload
import numpy as np
+import pandas as pd
from joblib import Memory
from sklearn.base import clone
from sklearn.pipeline import Pipeline as SkPipeline
@@ -22,19 +23,26 @@
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import check_memory
from sktime.forecasting.base import BaseForecaster
-from sktime.proba.normal import Normal
from typing_extensions import Self
from atom.utils.types import (
- Bool, DataFrame, Estimator, FHConstructor, Float, Pandas, Scalar, Sequence,
- Verbose, XConstructor, YConstructor,
+ Bool, EngineDataOptions, EngineTuple, Estimator, FHConstructor, Float,
+ Pandas, Scalar, Sequence, Verbose, XConstructor, XReturn, YConstructor,
+ YReturn,
)
from atom.utils.utils import (
- NotFittedError, adjust_verbosity, check_is_fitted, fit_one,
- fit_transform_one, transform_one, variable_return,
+ NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, to_df,
+ to_tabular, transform_one, variable_return,
)
+if TYPE_CHECKING:
+ from sktime.proba.normal import Normal
+
+
+T = TypeVar("T")
+
+
class Pipeline(SkPipeline):
"""Pipeline of transforms with a final estimator.
@@ -55,6 +63,7 @@ class Pipeline(SkPipeline):
and additionally:
- Can initialize with an empty pipeline.
+ - Always returns 'pandas' objects.
- Accepts transformers that drop rows.
- Accepts transformers that are only fitted on a subset of the
provided dataset.
@@ -221,6 +230,35 @@ def _can_inverse_transform(self) -> bool:
for _, _, est in self._iter()
)
+ @overload
+ def _convert(self, obj: Literal[None]) -> None: ...
+
+ @overload
+ def _convert(self, obj: pd.DataFrame) -> XReturn: ...
+
+ @overload
+ def _convert(self, obj: pd.Series) -> YReturn: ...
+
+ def _convert(self, obj: Pandas | None) -> YReturn | None:
+ """Convert data to the type set in the data engine.
+
+ Parameters
+ ----------
+ obj: pd.Series, pd.DataFrame or None
+ Object to convert. If None, return as is.
+
+ Returns
+ -------
+ object
+ Converted data.
+
+ """
+ # Only apply transformations when the engine is defined
+ if hasattr(self, "_engine") and isinstance(obj, pd.Series | pd.DataFrame):
+ return self._engine.data_engine.convert(obj)
+ else:
+ return obj
+
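
Behavior sketch of `Pipeline._convert` (names as in this diff): without a
configured engine the object passes through untouched; after `set_output`
it is converted by the selected data engine.

    import pandas as pd
    from atom.pipeline import Pipeline

    pipe = Pipeline(steps=[])            # ATOM pipelines may be empty
    df = pd.DataFrame({"x": [1, 2]})
    assert pipe._convert(df) is df       # no _engine set -> returned as-is

    pipe.set_output(transform="polars")  # requires polars to be installed
    print(type(pipe._convert(df)))       # a polars.DataFrame (assumption)
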
def _iter(
self,
*,
@@ -273,17 +311,17 @@ def _fit(
X: XConstructor | None = None,
y: YConstructor | None = None,
routed_params: dict[str, Bunch] | None = None,
- ) -> tuple[DataFrame | None, Pandas | None]:
+ ) -> tuple[pd.DataFrame | None, Pandas | None]:
"""Get data transformed through the pipeline.
Parameters
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored. None if the pipeline only uses y.
+ `X` is ignored. None if the pipeline only uses y.
- y: dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
routed_params: dict or None, default=None
Metadata parameters routed for the fit method.
@@ -300,6 +338,9 @@ def _fit(
self.steps: list[tuple[str, Estimator]] = list(self.steps)
self._validate_steps()
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
for step, name, transformer in self._iter(
with_final=False, filter_passthrough=False, filter_train_only=False
):
@@ -318,14 +359,14 @@ def _fit(
if hasattr(transformer, attr):
setattr(cloned, attr, getattr(transformer, attr))
- with adjust_verbosity(cloned, self._verbose):
+ with adjust(cloned, verbose=self._verbose):
# Fit or load the current estimator from cache
# Type ignore because routed_params is never None but
# the signature of _fit needs to comply with sklearn's
- X, y, fitted_transformer = self._mem_fit_transform(
+ Xt, yt, fitted_transformer = self._mem_fit_transform(
transformer=cloned,
- X=X,
- y=y,
+ X=Xt,
+ y=yt,
message=self._log_message(step),
**routed_params[name].fit_transform, # type: ignore[index]
)
@@ -334,7 +375,7 @@ def _fit(
# estimator (necessary when loading from cache)
self.steps[step] = (name, fitted_transformer)
- return X, y
+ return Xt, yt
def get_metadata_routing(self):
"""Get metadata routing of this object.
@@ -428,10 +469,10 @@ def fit(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
+ `X` is ignored.
- y: dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
**params
Parameters requested and accepted by steps. Each step must
@@ -445,15 +486,15 @@ def fit(
"""
routed_params = self._check_method_params(method="fit", props=params)
- X, y = self._fit(X, y, routed_params)
+ Xt, yt = self._fit(X, y, routed_params)
with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
if self._final_estimator is not None and self._final_estimator != "passthrough":
- with adjust_verbosity(self._final_estimator, self._verbose):
+ with adjust(self._final_estimator, verbose=self._verbose):
self._mem_fit(
estimator=self._final_estimator,
- X=X,
- y=y,
+ X=Xt,
+ y=yt,
**routed_params[self.steps[-1][0]].fit,
)
@@ -465,7 +506,7 @@ def fit_transform(
X: XConstructor | None = None,
y: YConstructor | None = None,
**params,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Fit the pipeline and transform the data.
Call `fit` followed by `transform` on each transformer in the
@@ -479,11 +520,11 @@ def fit_transform(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored. None
+ `X` is ignored. None
if the estimator only uses y.
- y: dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
**params
Parameters requested and accepted by steps. Each step must
@@ -500,21 +541,21 @@ def fit_transform(
"""
routed_params = self._check_method_params(method="fit_transform", props=params)
- X, y = self._fit(X, y, routed_params)
+ Xt, yt = self._fit(X, y, routed_params)
with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
if self._final_estimator is None or self._final_estimator == "passthrough":
- return variable_return(X, y)
+ return variable_return(Xt, yt)
- with adjust_verbosity(self._final_estimator, self._verbose):
- X, y, _ = self._mem_fit_transform(
+ with adjust(self._final_estimator, verbose=self._verbose):
+ Xt, yt, _ = self._mem_fit_transform(
transformer=self._final_estimator,
- X=X,
- y=y,
+ X=Xt,
+ y=yt,
**routed_params[self.steps[-1][0]].fit_transform,
)
- return variable_return(X, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
@available_if(_can_transform)
def transform(
@@ -524,7 +565,7 @@ def transform(
*,
filter_train_only: Bool = True,
**params,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Transform the data.
Call `transform` on each transformer in the pipeline. The
@@ -538,10 +579,10 @@ def transform(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored. None if the pipeline only uses y.
+ `X` is ignored. None if the pipeline only uses y.
- y: dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
filter_train_only: bool, default=True
Whether to exclude transformers that should only be used
@@ -564,19 +605,22 @@ def transform(
if X is None and y is None:
raise ValueError("X and y cannot be both None.")
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
_raise_for_params(params, self, "transform")
routed_params = process_routing(self, "transform", **params)
for _, name, transformer in self._iter(filter_train_only=filter_train_only):
- with adjust_verbosity(transformer, self._verbose):
- X, y = self._mem_transform(
+ with adjust(transformer, verbose=self._verbose):
+ Xt, yt = self._mem_transform(
transformer=transformer,
- X=X,
- y=y,
+ X=Xt,
+ y=yt,
**routed_params[name].transform,
)
- return variable_return(X, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
@available_if(_can_inverse_transform)
def inverse_transform(
@@ -586,7 +630,7 @@ def inverse_transform(
*,
filter_train_only: Bool = True,
**params,
- ) -> Pandas | tuple[DataFrame, Pandas]:
+ ) -> YReturn | tuple[XReturn, YReturn]:
"""Inverse transform for each step in a reverse order.
All estimators in the pipeline must implement the
@@ -596,10 +640,10 @@ def inverse_transform(
----------
X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored. None if the pipeline only uses y.
+ `X` is ignored. None if the pipeline only uses y.
- y: dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ y: sequence, dataframe-like or None, default=None
+ Target column(s) corresponding to `X`.
filter_train_only: bool, default=True
Whether to exclude transformers that should only be used
@@ -622,21 +666,24 @@ def inverse_transform(
if X is None and y is None:
raise ValueError("X and y cannot be both None.")
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
_raise_for_params(params, self, "inverse_transform")
routed_params = process_routing(self, "inverse_transform", **params)
reverse_iter = reversed(list(self._iter(filter_train_only=filter_train_only)))
for _, name, transformer in reverse_iter:
- with adjust_verbosity(transformer, self._verbose):
- X, y = self._mem_transform(
+ with adjust(transformer, verbose=self._verbose):
+ Xt, yt = self._mem_transform(
transformer=transformer,
- X=X,
- y=y,
+ X=Xt,
+ y=yt,
method="inverse_transform",
**routed_params[name].inverse_transform,
)
- return variable_return(X, y)
+ return variable_return(self._convert(Xt), self._convert(yt))
@available_if(_final_estimator_has("decision_function"))
def decision_function(self, X: XConstructor, **params) -> np.ndarray:
@@ -661,20 +708,22 @@ def decision_function(self, X: XConstructor, **params) -> np.ndarray:
multiclass classification tasks.
"""
+ Xt = to_df(X)
+
_raise_for_params(params, self, "decision_function")
routed_params = process_routing(self, "decision_function", **params)
for _, name, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, _ = self._mem_transform(
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(
transformer=transformer,
- X=X,
+ X=Xt,
**routed_params.get(name, {}).get("transform", {}),
)
return self.steps[-1][1].decision_function(
- X, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {})
+ Xt, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {})
)
@available_if(_final_estimator_has("predict"))
@@ -715,19 +764,21 @@ def predict(
if X is None and fh is None:
raise ValueError("X and fh cannot be both None.")
+ Xt = to_df(X)
+
routed_params = process_routing(self, "predict", **params)
for _, name, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, _ = self._mem_transform(transformer, X, **routed_params[name].transform)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform)
if isinstance(self._final_estimator, BaseForecaster):
if fh is None:
raise ValueError("The fh parameter cannot be None for forecasting estimators.")
- return self.steps[-1][1].predict(fh=fh, X=X)
+ return self.steps[-1][1].predict(fh=fh, X=Xt)
else:
- return self.steps[-1][1].predict(X, **routed_params[self.steps[-1][0]].predict)
+ return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict)
@available_if(_final_estimator_has("predict_interval"))
def predict_interval(
@@ -736,7 +787,7 @@ def predict_interval(
X: XConstructor | None = None,
*,
coverage: Float | Sequence[Float] = 0.9,
- ) -> Pandas:
+ ) -> pd.DataFrame:
"""Transform, then predict_quantiles of the final estimator.
Parameters
@@ -757,11 +808,13 @@ def predict_interval(
Computed interval forecasts.
"""
+ Xt = to_df(X)
+
for _, _, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, y = self._mem_transform(transformer, X)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(transformer, Xt)
- return self.steps[-1][1].predict_interval(fh=fh, X=X, coverage=coverage)
+ return self.steps[-1][1].predict_interval(fh=fh, X=Xt, coverage=coverage)
@available_if(_final_estimator_has("predict_log_proba"))
def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray:
@@ -784,14 +837,16 @@ def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray:
n_classes) or a list of arrays for [multioutput tasks][].
"""
+ Xt = to_df(X)
+
routed_params = process_routing(self, "predict_log_proba", **params)
for _, name, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, _ = self._mem_transform(transformer, X, **routed_params[name].transform)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform)
return self.steps[-1][1].predict_log_proba(
- X, **routed_params[self.steps[-1][0]].predict_log_proba
+ Xt, **routed_params[self.steps[-1][0]].predict_log_proba
)
@available_if(_final_estimator_has("predict_proba"))
@@ -838,20 +893,22 @@ def predict_proba(
if X is None and fh is None:
raise ValueError("X and fh cannot be both None.")
+ Xt = to_df(X)
+
routed_params = process_routing(self, "predict_proba", **params)
for _, name, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, _ = self._mem_transform(transformer, X, **routed_params[name].transform)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform)
if isinstance(self._final_estimator, BaseForecaster):
if fh is None:
raise ValueError("The fh parameter cannot be None for forecasting estimators.")
- return self.steps[-1][1].predict_proba(fh=fh, X=X, marginal=marginal)
+ return self.steps[-1][1].predict_proba(fh=fh, X=Xt, marginal=marginal)
else:
return self.steps[-1][1].predict_proba(
- X, **routed_params[self.steps[-1][0]].predict_proba
+ Xt, **routed_params[self.steps[-1][0]].predict_proba
)
@available_if(_final_estimator_has("predict_quantiles"))
@@ -883,11 +940,13 @@ def predict_quantiles(
Computed quantile forecasts.
"""
+ Xt = to_df(X)
+
for _, _, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, y = self._mem_transform(transformer, X)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(transformer, Xt)
- return self.steps[-1][1].predict_quantiles(fh=fh, X=X, alpha=alpha)
+ return self.steps[-1][1].predict_quantiles(fh=fh, X=Xt, alpha=alpha)
@available_if(_final_estimator_has("predict_residuals"))
def predict_residuals(
@@ -912,11 +971,14 @@ def predict_residuals(
n_targets) for [multivariate][] tasks.
"""
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
for _, _, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, y = self._mem_transform(transformer, X, y)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, yt = self._mem_transform(transformer, Xt, yt)
- return self.steps[-1][1].predict_residuals(y=y, X=X)
+ return self.steps[-1][1].predict_residuals(y=yt, X=Xt)
@available_if(_final_estimator_has("predict_var"))
def predict_var(
@@ -925,7 +987,7 @@ def predict_var(
X: XConstructor | None = None,
*,
cov: Bool = False,
- ) -> DataFrame:
+ ) -> pd.DataFrame:
"""Transform, then predict_var of the final estimator.
Parameters
@@ -947,11 +1009,49 @@ def predict_var(
Computed variance forecasts.
"""
+ Xt = to_df(X)
+
for _, _, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, _ = self._mem_transform(transformer, X)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, _ = self._mem_transform(transformer, Xt)
- return self.steps[-1][1].predict_var(fh=fh, X=X, cov=cov)
+ return self.steps[-1][1].predict_var(fh=fh, X=Xt, cov=cov)
+
+ def set_output(self, *, transform: EngineDataOptions | None = None) -> Self:
+ """Set output container.
+
+ See sklearn's [user guide][set_output] on how to use the
+ `set_output` API. See [here][data-acceleration] for a description
+ of the choices.
+
+ Parameters
+ ----------
+ transform: str or None, default=None
+ Configure the output of the `transform`, `fit_transform`,
+ and `inverse_transform` methods. If None, the configuration
+ is not changed. Choose from:
+
+ - "numpy"
+ - "pandas" (default)
+ - "pandas-pyarrow"
+ - "polars"
+ - "polars-lazy"
+ - "pyarrow"
+ - "modin"
+ - "dask"
+ - "pyspark"
+ - "pyspark-pandas"
+
+ Returns
+ -------
+ Self
+ Estimator instance.
+
+ """
+ if transform is not None:
+ self._engine = EngineTuple(data=transform)
+
+ return self
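
A hedged end-to-end sketch of the set_output API added above; every
non-pandas option requires the corresponding package to be installed.

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    from atom.pipeline import Pipeline

    X = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
    pipe = Pipeline([("scaler", StandardScaler())]).fit(X)

    Xt = pipe.set_output(transform="pandas").transform(X)  # pd.DataFrame
    Xt = pipe.set_output(transform="numpy").transform(X)   # np.ndarray (assumption)
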
@available_if(_final_estimator_has("score"))
def score(
@@ -971,7 +1071,7 @@ def score(
Feature set with shape=(n_samples, n_features). Can only
be `None` for [forecast][time-series] tasks.
- y: dict, sequence, dataframe or None, default=None
+ y: sequence, dataframe-like or None, default=None
Target values corresponding to `X`.
fh: int, sequence, [ForecastingHorizon][] or None, default=None
@@ -992,6 +1092,9 @@ def score(
if X is None and y is None:
raise ValueError("X and y cannot be both None.")
+ Xt = to_df(X)
+ yt = to_tabular(y, index=getattr(Xt, "index", None))
+
# Drop sample weights if sktime estimator
if not isinstance(self._final_estimator, BaseForecaster):
params["sample_weight"] = sample_weight
@@ -999,10 +1102,10 @@ def score(
routed_params = process_routing(self, "score", **params)
for _, name, transformer in self._iter(with_final=False):
- with adjust_verbosity(transformer, self._verbose):
- X, y = self._mem_transform(transformer, X, y, **routed_params[name].transform)
+ with adjust(transformer, verbose=self._verbose):
+ Xt, yt = self._mem_transform(transformer, Xt, yt, **routed_params[name].transform)
if isinstance(self._final_estimator, BaseForecaster):
- return self.steps[-1][1].score(y=y, X=X, fh=fh)
+ return self.steps[-1][1].score(y=yt, X=Xt, fh=fh)
else:
- return self.steps[-1][1].score(X, y, **routed_params[self.steps[-1][0]].score)
+ return self.steps[-1][1].score(Xt, yt, **routed_params[self.steps[-1][0]].score)
diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py
index 5ea7aa3ca..d23b16e82 100644
--- a/atom/plots/baseplot.py
+++ b/atom/plots/baseplot.py
@@ -15,6 +15,7 @@
import matplotlib.pyplot as plt
import numpy as np
+import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from beartype import beartype
@@ -25,9 +26,9 @@
from atom.plots.basefigure import BaseFigure
from atom.utils.constants import PALETTE
from atom.utils.types import (
- Bool, DataFrame, FloatLargerZero, FloatZeroToOneExc, Index, Int,
- IntLargerZero, Legend, MetricSelector, Model, ModelsSelector, PlotBackend,
- RowSelector, Scalar, Sequence, int_t, sequence_t,
+ Bool, FloatLargerZero, FloatZeroToOneExc, Int, IntLargerZero, Legend,
+ MetricSelector, Model, ModelsSelector, Pandas, PlotBackend, RowSelector,
+ Scalar, Sequence, int_t, sequence_t,
)
from atom.utils.utils import (
Aesthetics, check_is_fitted, composed, crash, get_custom_scorer, lst,
@@ -139,7 +140,7 @@ def marker_size(self, value: FloatLargerZero):
# Methods ====================================================== >>
@staticmethod
- def _get_plot_index(df: DataFrame) -> Index:
+ def _get_plot_index(obj: Pandas) -> pd.Index:
"""Return the dataset's index in a plottable format.
Plotly does not accept all index formats (e.g., pd.Period),
@@ -148,19 +149,19 @@ def _get_plot_index(df: DataFrame) -> Index:
Parameters
----------
- df: dataframe
+ obj: pd.Series or pd.DataFrame
Data set to get the index from.
Returns
-------
- index
+ pd.Index
Index in an acceptable format.
"""
- if hasattr(df.index, "to_timestamp"):
- return df.index.to_timestamp()
+ if hasattr(obj.index, "to_timestamp"):
+ return obj.index.to_timestamp()
else:
- return df.index
+ return obj.index
@staticmethod
def _get_show(show: IntLargerZero | None, maximum: IntLargerZero = 200) -> Int:
diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py
index 215bb488d..f8e2ceae5 100644
--- a/atom/plots/dataplot.py
+++ b/atom/plots/dataplot.py
@@ -30,8 +30,8 @@
from atom.plots.baseplot import BasePlot
from atom.utils.constants import PALETTE
from atom.utils.types import (
- Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, PACFMethods,
- RowSelector, Segment, Sequence, Series, TargetSelector,
+ Bool, ColumnSelector, Int, IntLargerZero, Legend, PACFMethods, RowSelector,
+ Segment, Sequence, TargetSelector,
)
from atom.utils.utils import (
check_dependency, crash, divide, get_corpus, has_task, lst,
@@ -540,7 +540,7 @@ def plot_components(
@crash
def plot_correlation(
self,
- columns: Segment | Sequence[Int | str] | DataFrame | None = None,
+ columns: Segment | Sequence[Int | str] | pd.DataFrame | None = None,
method: Literal["pearson", "kendall", "spearman"] = "pearson",
*,
title: str | dict[str, Any] | None = None,
@@ -1223,7 +1223,7 @@ def plot_ngrams(
"""
- def get_text(column: Series) -> Series:
+ def get_text(column: pd.Series) -> pd.Series:
"""Get the complete corpus as sequence of tokens.
Parameters
@@ -1862,7 +1862,7 @@ def plot_qq(
@crash
def plot_relationships(
self,
- columns: Segment | Sequence[Int | str] | DataFrame = (0, 1, 2),
+ columns: Segment | Sequence[Int | str] | pd.DataFrame = (0, 1, 2),
*,
title: str | dict[str, Any] | None = None,
legend: Legend | dict[str, Any] | None = None,
diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py
index d23499cfe..038e7f86d 100644
--- a/atom/plots/hyperparametertuningplot.py
+++ b/atom/plots/hyperparametertuningplot.py
@@ -14,6 +14,7 @@
from typing import Any
import numpy as np
+import pandas as pd
import plotly.graph_objects as go
from optuna.importance import FanovaImportanceEvaluator
from optuna.trial import TrialState
@@ -32,7 +33,7 @@
int_t, segment_t,
)
from atom.utils.utils import (
- bk, check_dependency, crash, divide, get_segment, it, lst, rnd,
+ check_dependency, crash, divide, get_segment, it, lst, rnd,
)
@@ -244,8 +245,8 @@ def plot_edf(
models_c = self._check_hyperparams(models_c)
metric_c = self._get_metric(metric)
- x_min = bk.concat([m.trials[metric_c] for m in models_c]).min(axis=None)
- x_max = bk.concat([m.trials[metric_c] for m in models_c]).max(axis=None)
+ x_min = pd.concat([m.trials[metric_c] for m in models_c]).min(axis=None)
+ x_max = pd.concat([m.trials[metric_c] for m in models_c]).max(axis=None)
x = np.linspace(x_min, x_max, 100)
self._get_figure()
diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py
index e3dcf6a68..cd0386a22 100644
--- a/atom/plots/predictionplot.py
+++ b/atom/plots/predictionplot.py
@@ -39,10 +39,10 @@
Bool, ColumnSelector, FloatZeroToOneExc, Int, IntLargerEqualZero,
IntLargerFour, IntLargerZero, Kind, Legend, MetricConstructor,
MetricSelector, ModelsSelector, RowSelector, Sequence, TargetSelector,
- TargetsSelector, XSelector, index_t,
+ TargetsSelector, XConstructor,
)
from atom.utils.utils import (
- Task, bk, check_canvas, check_dependency, check_empty, check_predict_proba,
+ Task, check_canvas, check_dependency, check_empty, check_predict_proba,
crash, divide, get_custom_scorer, has_task, lst, rnd,
)
@@ -832,7 +832,7 @@ def plot_errors(
from atom.models import OrdinaryLeastSquares
model = OrdinaryLeastSquares(goal=self._goal)
- estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred)
+ estimator = model._get_est({}).fit(pd.DataFrame(y_true), y_pred)
self._draw_line(
x=(x := np.linspace(y_true.min(), y_true.max(), 100)),
@@ -1116,7 +1116,7 @@ def plot_forecast(
self,
models: ModelsSelector = None,
fh: RowSelector | ForecastingHorizon = "dataset",
- X: XSelector | None = None,
+ X: XConstructor | None = None,
target: TargetSelector = 0,
*,
plot_insample: Bool = False,
@@ -1232,18 +1232,20 @@ def plot_forecast(
for m in models_c:
if X is not None:
- X = m.transform(X)
- elif isinstance(fh, index_t):
- X = m.branch._all.loc[fh]
+ Xt = m.transform(X)
+ elif isinstance(fh, pd.Index):
+ Xt = m.branch._all.loc[fh]
+ else:
+ Xt = X
# Draw predictions and interval
- y_pred = m.predict(fh=fh, X=check_empty(X))
+ y_pred = m.predict(fh=fh, X=check_empty(Xt))
if self.task.is_multioutput:
y_pred = y_pred[target_c]
if not plot_insample:
idx = y_pred.index.intersection(m.branch.train.index)
- y_pred.loc[idx] = np.NaN # type: ignore[index]
+ y_pred.loc[idx] = np.NaN # type: ignore[call-overload]
y_true = m.branch._all.loc[y_pred.index, target_c]
@@ -1271,7 +1273,7 @@ def plot_forecast(
if plot_interval:
try:
- y_interval = m.predict_interval(fh=fh, X=X)
+ y_interval = m.predict_interval(fh=fh, X=Xt)
except (AttributeError, NotImplementedError):
continue # Fails for some models like ES
@@ -1887,7 +1889,7 @@ class is always the positive one.
data = data.sample(500, random_state=self.random_state)
explanation = m._shap.get_explanation(data, target_c)
- shap = bk.DataFrame(explanation.values, columns=m.branch.features)
+ shap = pd.DataFrame(explanation.values, columns=m.branch.features)
parshap[ds] = pd.Series(index=fxs, dtype=float)
for fx in fxs:
@@ -2134,7 +2136,7 @@ def plot_partial_dependence(
axes.append((xaxis, yaxis))
# Compute averaged predictions
- predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)(
+ predictions = Parallel(n_jobs=self.n_jobs)(
delayed(partial_dependence)(
estimator=m.estimator,
X=m.branch.X_test,
diff --git a/atom/training.py b/atom/training.py
index 6b0fbe8fe..8cd5a97e0 100644
--- a/atom/training.py
+++ b/atom/training.py
@@ -371,6 +371,7 @@ class DirectClassifier(Direct):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -605,6 +606,7 @@ class DirectForecaster(Direct):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -835,6 +837,7 @@ class DirectRegressor(Direct):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -1075,6 +1078,7 @@ class SuccessiveHalvingClassifier(SuccessiveHalving):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -1312,6 +1316,7 @@ class SuccessiveHalvingForecaster(SuccessiveHalving):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -1546,6 +1551,7 @@ class SuccessiveHalvingRegressor(SuccessiveHalving):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -1792,6 +1798,7 @@ class TrainSizingClassifier(TrainSizing):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -2035,6 +2042,7 @@ class TrainSizingForecaster(TrainSizing):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
@@ -2274,6 +2282,7 @@ class TrainSizingRegressor(TrainSizing):
parallelism. Less robust than `loky`.
- "threading": Single-node, thread-based parallelism.
- "ray": Multi-node, process-based parallelism.
+ - "dask": Multi-node, process-based parallelism.
memory: bool, str, Path or Memory, default=False
Enables caching for memory optimization. Read more in the
diff --git a/atom/utils/patches.py b/atom/utils/patches.py
index ad592220c..bc7b2bc2b 100644
--- a/atom/utils/patches.py
+++ b/atom/utils/patches.py
@@ -9,7 +9,6 @@
from collections.abc import Callable
from copy import deepcopy
-from functools import wraps
from typing import Any
from unittest.mock import patch
@@ -22,7 +21,6 @@
from sklearn.ensemble._base import _fit_single_estimator
from sklearn.model_selection._validation import _fit_and_score, _score
from sklearn.utils import Bunch
-from sklearn.utils._set_output import _wrap_method_output
from sklearn.utils.multiclass import check_classification_targets
from sktime.forecasting.compose import EnsembleForecaster as EF
from sktime.forecasting.compose import StackingForecaster as SF
@@ -36,24 +34,6 @@
# Functions ======================================================== >>
-def wrap_method_output(f: Callable, method: str) -> Callable:
- """Wrap sklearn's _wrap_method_output function.
-
- Custom implementation to avoid errors for transformers that allow
- only providing `y`. Is used internally by _SetOutputMixin.
-
- """
-
- @wraps(f)
- def wrapper(self, *args, **kwargs):
- try:
- return _wrap_method_output(f, method)(self, *args, **kwargs)
- except TypeError:
- return f(self, *args, **kwargs)
-
- return wrapper
-
-
def fit_and_score(*args, **kwargs) -> dict[str, Any]:
"""Wrap sklearn's _fit_and_score function.
diff --git a/atom/utils/types.py b/atom/utils/types.py
index f9a674aa3..8a86a3067 100644
--- a/atom/utils/types.py
+++ b/atom/utils/types.py
@@ -7,16 +7,16 @@
from __future__ import annotations
-from collections.abc import Callable, Hashable, Iterable, Iterator
+import os
+from collections.abc import Callable, Hashable, Iterator
+from importlib.util import find_spec
from typing import (
TYPE_CHECKING, Annotated, Any, Literal, NamedTuple, SupportsIndex,
TypeAlias, TypedDict, TypeVar, overload, runtime_checkable,
)
-import modin.pandas as md
import numpy as np
import pandas as pd
-import scipy.sparse as sps
from beartype.door import is_bearable
from beartype.typing import Protocol
from beartype.vale import Is
@@ -25,7 +25,12 @@
if TYPE_CHECKING:
- from atom.utils.utils import ClassMap, Goal
+ from atom.data.dataengines import DataEngine
+ from atom.utils.utils import Goal
+
+
+# Avoid warning about pyarrow timezones not set
+os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# Classes for type hinting ========================================= >>
@@ -117,6 +122,13 @@ def __repr__(self) -> str:
"""Print representation as dictionary."""
return self._asdict().__repr__()
+ @property
+ def data_engine(self) -> DataEngine:
+ """Return the data engine."""
+ from atom.data import DATA_ENGINES
+
+ return DATA_ENGINES[self.data]()
+
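
Sketch of the property above: the `data` string is resolved to a DataEngine
instance through the DATA_ENGINES registry (the engine class name printed
below is an assumption).

    from atom.utils.types import EngineTuple

    engine = EngineTuple(data="pandas")
    print(type(engine.data_engine).__name__)  # e.g. PandasEngine (assumption)
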
class SPTuple(NamedTuple):
"""Return type of the `sp` parameter."""
@@ -126,6 +138,28 @@ class SPTuple(NamedTuple):
trend_model: SeasonalityModels = "additive"
+@runtime_checkable
+class SparseMatrix(Protocol):
+ """Protocol for sparse matrices.
+
+ Required since scipy doesn't have stubs.
+
+ """
+
+ def __len__(self) -> int: ...
+ def __iter__(self) -> Iterator: ...
+ def _bsr_container(self): ...
+ def _coo_container(self): ...
+ def _csc_container(self): ...
+ def _csr_container(self): ...
+ def _dia_container(self): ...
+ def _dok_container(self): ...
+ def _lil_container(self): ...
+
+ @property
+ def shape(self) -> tuple[int, int]: ...
+
+
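
Since the protocol is runtime-checkable, `isinstance` verifies the listed
attributes structurally, which lets it match scipy's sparse containers
without importing scipy in this module.

    import scipy.sparse as sps
    from atom.utils.types import SparseMatrix

    m = sps.csr_matrix((3, 4))
    print(isinstance(m, SparseMatrix))  # True for scipy versions that expose
                                        # the _*_container interop attributes
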
@runtime_checkable
class SkScorer(Protocol):
"""Protocol for sklearn's scorers."""
@@ -177,7 +211,6 @@ class Model(Protocol):
"""Protocol for all models."""
_goal: Goal
- _metric: ClassMap
_ht: dict[str, Any]
def predict(self, *args, **kwargs) -> Pandas: ...
@@ -190,11 +223,8 @@ def predict(self, *args, **kwargs) -> Pandas: ...
Int: TypeAlias = int | np.integer
Float: TypeAlias = float | np.floating
Scalar: TypeAlias = Int | Float
-Segment: TypeAlias = range | slice
-Index: TypeAlias = pd.Index | md.Index
-Series: TypeAlias = pd.Series | md.Series
-DataFrame: TypeAlias = pd.DataFrame | md.DataFrame
-Pandas: TypeAlias = Series | DataFrame
+Segment: TypeAlias = slice | range
+Pandas: TypeAlias = pd.Series | pd.DataFrame
# Numerical types
IntLargerZero: TypeAlias = Annotated[Int, Is[lambda x: x > 0]]
@@ -210,23 +240,19 @@ def predict(self, *args, **kwargs) -> Pandas: ...
# Types for X, y and fh
XConstructor: TypeAlias = (
dict[str, Sequence[Any]]
- | Sequence[Sequence[Any]]
- | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]]
+ | Sequence[Sequence[Any] | tuple[Hashable, Sequence[Any]]]
| np.ndarray
- | sps.spmatrix
- | DataFrame
+ | SparseMatrix
+ | pd.Series
+ | pd.DataFrame
)
XSelector: TypeAlias = XConstructor | Callable[..., XConstructor]
-YConstructor: TypeAlias = dict[str, Any] | Sequence[Any] | XConstructor
+YConstructor: TypeAlias = Sequence[Any] | XConstructor
YSelector: TypeAlias = Int | str | YConstructor
FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon
-# Return types for transform methods
-TReturn: TypeAlias = np.ndarray | sps.spmatrix | Series | DataFrame
-TReturns: TypeAlias = TReturn | tuple[TReturn, TReturn]
-
# Selection of rows or columns by name or position
-ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | DataFrame
+ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | pd.DataFrame
RowSelector: TypeAlias = Hashable | Sequence[Hashable] | ColumnSelector
# Assignment of index or stratify parameter
@@ -248,10 +274,21 @@ def predict(self, *args, **kwargs) -> Pandas: ...
# BaseTransformer parameters
NJobs: TypeAlias = Annotated[Int, Is[lambda x: x != 0]]
-EngineDataOptions: TypeAlias = Literal["pandas", "pyarrow", "modin"]
+EngineDataOptions: TypeAlias = Literal[
+ "numpy",
+ "pandas",
+ "pandas-pyarrow",
+ "polars",
+ "polars-lazy",
+ "pyarrow",
+ "modin",
+ "dask",
+ "pyspark",
+ "pyspark-pandas",
+]
EngineEstimatorOptions: TypeAlias = Literal["sklearn", "sklearnex", "cuml"]
Engine: TypeAlias = EngineDataOptions | EngineEstimatorOptions | EngineDict | EngineTuple | None
-Backend: TypeAlias = Literal["loky", "multiprocessing", "threading", "ray"]
+Backend: TypeAlias = Literal["loky", "multiprocessing", "threading", "ray", "dask"]
Warnings: TypeAlias = Literal["default", "error", "ignore", "always", "module", "once"]
Severity: TypeAlias = Literal["debug", "info", "warning", "error", "critical"]
Verbose: TypeAlias = Literal[0, 1, 2]
@@ -299,7 +336,11 @@ def predict(self, *args, **kwargs) -> Pandas: ...
# Allowed values for method selection
PredictionMethods: TypeAlias = Literal[
- "decision_function", "predict", "predict_log_proba", "predict_proba", "score"
+ "decision_function",
+ "predict",
+ "predict_log_proba",
+ "predict_proba",
+ "score",
]
PredictionMethodsTS: TypeAlias = Literal[
"predict",
@@ -331,6 +372,17 @@ def predict(self, *args, **kwargs) -> Pandas: ...
]
# Others
+XDatasets: TypeAlias = Literal[
+ "dataset",
+ "train",
+ "test",
+ "holdout",
+ "X",
+ "X_train",
+ "X_test",
+ "X_holdout",
+]
+YDatasets: TypeAlias = Literal["y", "y_train", "y_test", "y_holdout"]
Seasonality: TypeAlias = IntLargerOne | str | Sequence[IntLargerOne | str] | None
SeasonalityModels: TypeAlias = Literal["additive", "multiplicative"]
FeatureNamesOut: TypeAlias = (
@@ -360,6 +412,71 @@ def predict(self, *args, **kwargs) -> Pandas: ...
| Sequence[IntLargerEqualZero]
)
+# Return types for transform methods
+if TYPE_CHECKING:
+ import dask.dataframe as dd
+ import modin.pandas as md
+ import polars as pl
+ import pyarrow as pa
+ import pyspark.pandas as ps
+ from pyspark.sql import DataFrame as SparkDataFrame
+
+ XReturn: TypeAlias = (
+ np.ndarray
+ | pd.DataFrame
+ | pl.DataFrame
+ | pl.LazyFrame
+ | pa.Table
+ | md.DataFrame
+ | dd.DataFrame
+ | SparkDataFrame
+ )
+ YReturn: TypeAlias = (
+ np.ndarray
+ | pd.Series
+ | pl.Series
+ | pa.Array
+ | md.Series
+ | dd.Series
+ | ps.Series
+ )
+else:
+ XReturn: TypeAlias = Sequence[Sequence[Any]] | np.ndarray | SparseMatrix | pd.DataFrame
+ YReturn: TypeAlias = Sequence[Any] | np.ndarray | pd.Series
+
+ if find_spec("polars"):
+ import polars as pl
+
+ XReturn = XReturn | pl.DataFrame | pl.LazyFrame
+ YReturn = YReturn | pl.Series
+
+ if find_spec("pyarrow"):
+ import pyarrow as pa
+
+ XReturn = XReturn | pa.Table
+ YReturn = YReturn | pa.Array
+
+ if find_spec("modin"):
+ import modin.pandas as md
+
+ XReturn = XReturn | md.DataFrame
+ YReturn = YReturn | md.Series
+
+ if find_spec("dask"):
+ import dask.dataframe as dd
+
+ XReturn = XReturn | dd.DataFrame
+ YReturn = YReturn | dd.Series
+
+ if find_spec("pyspark"):
+ import pyspark.pandas as ps
+ from pyspark.sql import DataFrame as SparkDataFrame
+
+ XReturn = XReturn | SparkDataFrame | ps.DataFrame
+ YReturn = YReturn | SparkDataFrame | ps.Series
+
+ YReturn = YReturn | XReturn
+
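
The runtime branch widens the accepted unions only when the optional
backend is importable; the same pattern in isolation:

    from importlib.util import find_spec

    Supported = int | float
    if find_spec("polars"):        # only extend when polars is installed
        import polars as pl
        Supported = Supported | pl.DataFrame

    print(isinstance(3, Supported))  # True; unions built this way work with
                                     # isinstance and beartype alike
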
# Variable types for isinstance ================================== >>
@@ -370,8 +487,5 @@ def predict(self, *args, **kwargs) -> Pandas: ...
int_t = (int, np.integer)
float_t = (float, np.floating)
segment_t = (slice, range)
-index_t = (pd.Index, md.Index)
-series_t = (pd.Series, md.Series)
-sequence_t = (range, list, tuple, np.ndarray, *index_t, *series_t)
-dataframe_t = (pd.DataFrame, md.DataFrame)
-pandas_t = (*series_t, *dataframe_t)
+sequence_t = (range, list, tuple, np.ndarray, pd.Index, pd.Series)
+pandas_t = (pd.Series, pd.DataFrame)
diff --git a/atom/utils/utils.py b/atom/utils/utils.py
index 364193aa1..10354963c 100644
--- a/atom/utils/utils.py
+++ b/atom/utils/utils.py
@@ -8,11 +8,10 @@
from __future__ import annotations
import functools
-import os
import sys
import warnings
from collections import deque
-from collections.abc import Callable, Hashable, Iterator
+from collections.abc import Callable, Iterator
from contextlib import contextmanager
from copy import copy
from dataclasses import dataclass
@@ -23,26 +22,21 @@
from inspect import Parameter, signature
from itertools import cycle
from types import GeneratorType, MappingProxyType
-from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload
import mlflow
-import modin.pandas as md
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import scipy.sparse as sps
-from beartype import beartype
from beartype.door import is_bearable
from IPython.display import display
from matplotlib.colors import to_rgba
-from mlflow.models.signature import infer_signature
-from optuna.study import Study
-from optuna.trial import FrozenTrial
from pandas._libs.missing import NAType
-from pandas._typing import Axes, Dtype, DtypeArg
+from pandas._typing import Axes, Dtype
from pandas.api.types import is_numeric_dtype
-from shap import Explainer, Explanation
+from shap import Explainer
from sklearn.base import BaseEstimator
from sklearn.base import OneToOneFeatureMixin as FMixin
from sklearn.metrics import (
@@ -52,31 +46,34 @@
from sklearn.utils import _print_elapsed_time
from sklearn.utils.validation import _is_fitted
-from atom.utils.constants import __version__
+from atom.utils.constants import CAT_TYPES, __version__
from atom.utils.types import (
- Bool, DataFrame, Estimator, FeatureNamesOut, Float, Index, IndexSelector,
- Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar,
- Scorer, Segment, Sequence, Series, SPTuple, Transformer, TReturn, TReturns,
- Verbose, XConstructor, XSelector, YConstructor, YSelector, dataframe_t,
- int_t, pandas_t, segment_t, sequence_t, series_t,
+ Bool, EngineDataOptions, EngineTuple, Estimator, FeatureNamesOut, Float,
+ IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas,
+ Predictor, Scalar, Scorer, Segment, Sequence, SPTuple, Transformer,
+ Verbose, XConstructor, XReturn, YConstructor, YReturn, int_t, segment_t,
+ sequence_t,
)
if TYPE_CHECKING:
+ from optuna.study import Study
+ from optuna.trial import FrozenTrial
+ from shap import Explanation
+
from atom.basemodel import BaseModel
from atom.baserunner import BaseRunner
- from atom.branch import Branch
+ from atom.data import Branch
T = TypeVar("T")
-T_Pandas = TypeVar("T_Pandas", Series, DataFrame)
+T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame)
T_Transformer = TypeVar("T_Transformer", bound=Transformer)
T_Estimator = TypeVar("T_Estimator", bound=Estimator)
# Classes ========================================================== >>
-
class NotFittedError(ValueError, AttributeError):
"""Exception called when the instance is not yet fitted.
@@ -98,7 +95,7 @@ def infer_task(self, y: Pandas) -> Task:
Parameters
----------
- y: series or dataframe
+ y: pd.Series or pd.DataFrame
Target column(s).
Returns
@@ -108,17 +105,17 @@ def infer_task(self, y: Pandas) -> Task:
"""
if self.value == 1:
- if isinstance(y, series_t):
+ if isinstance(y, pd.Series):
return Task.regression
else:
return Task.multioutput_regression
elif self.value == 2:
- if isinstance(y, series_t):
+ if isinstance(y, pd.Series):
return Task.univariate_forecast
else:
return Task.multivariate_forecast
- if isinstance(y, dataframe_t):
+ if isinstance(y, pd.DataFrame):
if all(y[col].nunique() == 2 for col in y.columns):
return Task.multilabel_classification
else:
@@ -207,10 +204,10 @@ class SeasonalPeriod(IntEnum):
class DataContainer:
"""Stores a branch's data."""
- data: DataFrame # Complete dataset
- train_idx: Index # Indices in the train set
- test_idx: Index # Indices in the test
- n_cols: Int # Number of target columns
+ data: pd.DataFrame # Complete dataset
+ train_idx: pd.Index # Indices in the train set
+ test_idx: pd.Index # Indices in the test
+ n_targets: int # Number of target columns
@dataclass
@@ -245,7 +242,7 @@ class DataConfig:
"""
- index: bool = True
+ index: bool = False
ignore: tuple[str, ...] = ()
sp: SPTuple = SPTuple() # noqa: RUF009
shuffle: Bool = False
@@ -254,20 +251,20 @@ class DataConfig:
test_size: Scalar = 0.2
holdout_size: Scalar | None = None
- def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None:
+ def get_stratify_columns(self, df: pd.DataFrame, y: Pandas) -> pd.DataFrame | None:
"""Get columns to stratify by.
Parameters
----------
- df: dataframe
+ df: pd.DataFrame
Dataset from which to get the columns.
- y: series or dataframe
- Target column.
+ y: pd.Series or pd.DataFrame
+ Target column(s).
Returns
-------
- dataframe or None
+ pd.DataFrame or None
Dataset with subselection of columns. Returns None if
there's no stratification.
@@ -302,26 +299,6 @@ def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None:
return df[inc]
-class PandasModin:
- """Utility class to select the right data engine.
-
- Returns pandas or modin depending on the env variable
- ATOM_DATA_ENGINE, which is set in BaseTransformer.py.
-
- """
-
- def __getattr__(self, item: str) -> Any:
- """Return the backend engine."""
- if os.environ.get("ATOM_DATA_ENGINE") == "modin":
- return getattr(md, item)
- else:
- return getattr(pd, item)
-
-
-# ATOM uses this instance to access the data engine
-bk = PandasModin()
-
-
class CatBMetric:
"""Custom evaluation metric for the CatBoost model.
@@ -653,7 +630,8 @@ def __init__(self, model: BaseModel, n_jobs: Int):
def __call__(self, study: Study, trial: FrozenTrial):
"""Print trial info and store in mlflow experiment."""
try: # Fails when there are no successful trials
- trial_info = self.T.trials.reset_index(names="trial").loc[trial.number]
+ trials = self.T.trials.reset_index(names="trial")
+ trial_info = cast(pd.Series, trials.loc[trial.number]) # Loc returns df or series
except KeyError:
return
@@ -685,7 +663,7 @@ def __call__(self, study: Study, trial: FrozenTrial):
mlflow.sklearn.log_model(
sk_model=estimator,
artifact_path=estimator.__class__.__name__,
- signature=infer_signature(
+ signature=mlflow.models.signature.infer_signature(
model_input=pd.DataFrame(self.T.branch.X),
model_output=estimator.predict(self.T.branch.X.iloc[[0]]),
),
@@ -761,7 +739,7 @@ def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics):
self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))}
self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))}
- traces: list[go.Scatter] = []
+ traces = []
colors = cycle(aesthetics.palette)
for met in metric:
color = next(colors)
@@ -943,16 +921,16 @@ def explainer(self) -> Explainer:
Returns
-------
- Explainer
+ shap.Explainer
Get the initialized explainer object.
"""
- # Pass masker as np.array and feature names separately for modin frames
kwargs = {
- "masker": self.branch.X_train.to_numpy(),
+ "masker": self.branch.X_train,
"feature_names": list(self.branch.features),
"seed": self.random_state,
}
+
try: # Fails when model does not fit standard explainers (e.g., ensembles)
return Explainer(self.estimator, **kwargs)
except TypeError:
@@ -961,7 +939,7 @@ def explainer(self) -> Explainer:
def get_explanation(
self,
- df: DataFrame,
+ df: pd.DataFrame,
target: tuple[Int, ...],
) -> Explanation:
"""Get an Explanation object.
@@ -970,7 +948,7 @@ def get_explanation(
Parameters
----------
- df: dataframe
+ df: pd.DataFrame
Data set to look at (subset of the complete dataset).
target: tuple
@@ -1009,10 +987,10 @@ def get_explanation(
) from None
# Remember shap values in the _shap_values attribute
- self._shap_values = bk.concat(
+ self._shap_values = pd.concat(
[
self._shap_values,
- bk.Series(list(self._explanation.values), index=calculate.index),
+ pd.Series(list(self._explanation.values), index=calculate.index),
]
)
@@ -1330,7 +1308,7 @@ def sign(obj: Callable) -> MappingProxyType:
return signature(obj).parameters
-def merge(*args) -> DataFrame:
+def merge(*args) -> pd.DataFrame:
"""Concatenate pandas objects column-wise.
None and empty objects are ignored.
@@ -1342,14 +1320,14 @@ def merge(*args) -> DataFrame:
Returns
-------
- dataframe
+ pd.DataFrame
Concatenated dataframe.
"""
if len(args_c := [x for x in args if x is not None and not x.empty]) == 1:
- return bk.DataFrame(args_c[0])
+ return pd.DataFrame(args_c[0])
else:
- return bk.DataFrame(bk.concat(args_c, axis=1))
+ return pd.DataFrame(pd.concat(args_c, axis=1))
def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_Pandas:
@@ -1360,7 +1338,7 @@ def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_P
Parameters
----------
- X: series or dataframe
+ X: pd.Series or pd.DataFrame
Data set to replace.
missing_values: list or None, default=None
@@ -1369,7 +1347,7 @@ def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_P
Returns
-------
- series or dataframe
+ pd.Series or pd.DataFrame
Data set without missing values.
"""
@@ -1393,42 +1371,94 @@ def get_nan(dtype: Dtype) -> float | NAType:
# Always convert these values
default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf]
- if isinstance(X, series_t):
+ if isinstance(X, pd.DataFrame):
return X.replace(
- to_replace=(missing_values or []) + default_values,
- value=get_nan(X.dtype),
+ to_replace={c: (missing_values or []) + default_values for c in X.columns},
+ value={c: get_nan(d) for c, d in X.dtypes.items()},
)
else:
return X.replace(
- to_replace={k: (missing_values or []) + default_values for k in X},
- value={k: get_nan(X[k].dtype) for k in X},
+ to_replace=(missing_values or []) + default_values,
+ value=get_nan(X.dtype),
)
-def get_cols(elem: Pandas) -> list[Series]:
+def n_cols(obj: YConstructor | None) -> int:
+ """Get the number of columns in a dataset.
+
+ Parameters
+ ----------
+ obj: dict, sequence, dataframe-like or None
+ Dataset to check.
+
+ Returns
+ -------
+ int
+ Number of columns.
+
+ """
+ if hasattr(obj, "shape"):
+ return obj.shape[1] if len(obj.shape) > 1 else 1 # type: ignore[union-attr]
+ elif isinstance(obj, dict):
+ return 2 # Dict always goes to dataframe
+
+ try:
+ if (array := np.asarray(obj)).ndim > 1:
+ return array.shape[1]
+ else:
+ return array.ndim
+ except ValueError:
+ # Fails for inhomogeneous data, return series
+ return 1
+
+
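A few illustrative calls against the new `n_cols` (values invented; note that dicts map to 2, so they always end up as dataframes):

```python
import numpy as np
import pandas as pd

n_cols(pd.Series([1, 2, 3]))        # 1: one-dimensional shape
n_cols(np.zeros((5, 3)))            # 3: second dimension of shape
n_cols({"a": [1, 2], "b": [3, 4]})  # 2: dicts always go to a dataframe
n_cols([[1, "a"], [2]])             # 1: inhomogeneous data falls back to a series
```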
+def get_cols(obj: Pandas) -> list[pd.Series]:
"""Get a list of columns in dataframe / series.
Parameters
----------
- elem: series or dataframe
+ obj: pd.Series or pd.DataFrame
Element to get the columns from.
Returns
-------
- list of series
- Columns in elem.
+ list of pd.Series
+ Columns.
"""
- if isinstance(elem, series_t):
- return [elem]
+ if isinstance(obj, pd.Series):
+ return [obj]
else:
- return [elem[col] for col in elem.columns]
+ return [obj[col] for col in obj.columns]
+
+
+def get_col_names(obj: Any) -> list[str] | None:
+ """Get a list of column names in tabular objects.
+
+ Parameters
+ ----------
+ obj: object
+ Element to get the column names from.
+
+ Returns
+ -------
+ list of str
+        Names of the columns. Returns None when the object passed
+        is not a pandas object.
+
+ """
+ if isinstance(obj, pd.DataFrame):
+ return list(obj.columns)
+ elif isinstance(obj, pd.Series):
+ return [str(obj.name)]
+ else:
+ return None
def variable_return(
- X: DataFrame | None,
- y: Series | None,
-) -> DataFrame | Series | tuple[DataFrame, Pandas]:
+ X: XReturn | None,
+ y: YReturn | None,
+) -> XReturn | tuple[XReturn, YReturn]:
"""Return one or two arguments depending on which is None.
This utility is used to make methods return only the provided
@@ -1440,20 +1470,22 @@ def variable_return(
Feature set.
y: series, dataframe or None
- Target column.
+ Target column(s).
Returns
-------
- dataframe, series or tuple
+ series, dataframe or tuple
Data sets that are not None.
"""
- if y is None:
+ if y is None and X is not None:
return X
- elif X is None:
+ elif X is None and y is not None:
return y
- else:
+ elif X is not None and y is not None:
return X, y
+ else:
+ raise ValueError("Both X and y can't be None.")
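The resulting behavior, sketched with invented objects:

```python
import pandas as pd

X = pd.DataFrame({"a": [1, 2]})
y = pd.Series([0, 1], name="target")

variable_return(X, None)     # returns X
variable_return(None, y)     # returns y
variable_return(X, y)        # returns (X, y)
variable_return(None, None)  # raises ValueError
```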
def get_segment(obj: list[T], segment: Segment) -> list[T]:
@@ -1486,7 +1518,7 @@ def is_sparse(obj: Pandas) -> bool:
Parameters
----------
- obj: series or dataframe
+ obj: pd.Series or pd.DataFrame
Data set to check.
Returns
@@ -1498,25 +1530,27 @@ def is_sparse(obj: Pandas) -> bool:
return any(isinstance(col.dtype, pd.SparseDtype) for col in get_cols(obj))
-def check_empty(obj: Pandas) -> Pandas | None:
+def check_empty(obj: Pandas | None) -> Pandas | None:
"""Check if a pandas object is empty.
Parameters
----------
- obj: series or dataframe
+ obj: pd.Series, pd.DataFrame or None
Pandas object to check.
Returns
-------
- series, dataframe or None
- Same object or None if empty.
+ pd.Series, pd.DataFrame or None
+ Same object or None if empty or obj is None.
"""
- return obj if isinstance(obj, dataframe_t) and not obj.empty else None
+ return obj if isinstance(obj, pd.DataFrame) and not obj.empty else None
def check_dependency(name: str):
- """Raise an error if a package is not installed.
+ """Check an optional dependency.
+
+ Raise an error if the package is not installed.
Parameters
----------
@@ -1524,7 +1558,7 @@ def check_dependency(name: str):
Name of the package to check.
"""
- if not find_spec(name.replace("-", "_")):
+ if not find_spec(name):
raise ModuleNotFoundError(
f"Unable to import the {name} package. Install it using "
f"`pip install {name}` or install all of atom's optional "
@@ -1591,50 +1625,38 @@ def check_predict_proba(models: Model | Sequence[Model], method: str):
)
-def check_scaling(X: Pandas, pipeline: Any | None = None) -> bool:
+def check_scaling(obj: Pandas) -> bool:
"""Check if the data is scaled.
A data set is considered scaled when the mean of the mean of
all columns lies between -0.05 and 0.05 and the mean of the
standard deviation of all columns lies between 0.85 and 1.15.
- Binary columns are excluded from the calculation.
-
- Additionally, if a pipeline is provided and there's a scaler in
- the pipeline, it also returns False.
+ Categorical and binary columns are excluded from the calculation.
Parameters
----------
- X: series or dataframe
+ obj: pd.Series or pd.DataFrame
Data set to check.
- pipeline: Pipeline or None, default=None
- Pipeline in which to check for a scaler (any estimator whose
- name contains the word scaler).
-
Returns
-------
bool
Whether the data set is scaled.
"""
- has_scaler = False
- if pipeline is not None:
- has_scaler = any("scaler" in name.lower() for name in pipeline.named_steps)
-
- df = to_df(X) # Convert to dataframe
- df = df.loc[:, (~df.isin([0, 1])).any(axis=0)] # Remove binary columns
-
- if df.empty: # All columns are binary -> no scaling needed
- return True
+ if isinstance(obj, pd.DataFrame):
+ mean = obj.mean(numeric_only=True).mean()
+ std = obj.std(numeric_only=True).mean()
else:
- mean = df.mean(numeric_only=True).mean()
- std = df.std(numeric_only=True).mean()
- return has_scaler or bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15)
+ mean = obj.mean()
+ std = obj.std()
+
+ return bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15)
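A quick check of the heuristic above against raw and standardized data (dataset invented for illustration):

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(50, 10, size=(1000, 3)), columns=["a", "b", "c"])

check_scaling(X)  # False: column means ~50, stds ~10

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
check_scaling(X_scaled)  # True: means ~0, stds ~1
```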
@contextmanager
def keep_attrs(estimator: Estimator):
- """Contextmanager to save an estimator's custom attributes.
+ """Temporarily save an estimator's custom attributes.
ATOM's pipeline uses two custom attributes for its transformers:
_train_only, and _cols. Since some transformers reset their
@@ -1654,30 +1676,42 @@ def keep_attrs(estimator: Estimator):
@contextmanager
-def adjust_verbosity(estimator: Estimator, verbose: Verbose | None):
- """Contextmanager to save an estimator's custom attributes.
+def adjust(
+ estimator: Estimator,
+ *,
+ transform: EngineDataOptions | None = None,
+ verbose: Verbose | None = None,
+):
+ """Temporarily adjust output parameters of an estimator.
- ATOM's pipeline uses two custom attributes for its transformers:
- _train_only, and _cols. Since some transformers reset their
- attributes during fit (like those from sktime), we wrap the fit
- method in a contextmanager that saves and restores the attrs.
+ The estimator's data engine and verbosity are temporarily changed
+ to the provided values.
Parameters
----------
estimator: Estimator
        Estimator whose output parameters are temporarily adjusted.
+ transform: str or None, default=None
+ Data engine for the estimator. If None, it leaves it to
+ its original engine.
+
verbose: int or None, default=None
- Verbosity level of the transformers in the pipeline. If
- None, it leaves them to their original verbosity.
+ Verbosity level for the estimator. If None, it leaves it to
+ its original verbosity.
"""
try:
+ if transform is not None and hasattr(estimator, "set_output"):
+ output = getattr(estimator, "_engine", EngineTuple())
+ estimator.set_output(transform=transform)
if verbose is not None and hasattr(estimator, "verbose"):
verbosity = estimator.verbose
estimator.verbose = verbose
yield estimator
finally:
+ if transform is not None and hasattr(estimator, "set_output"):
+ estimator._engine = output
if verbose is not None and hasattr(estimator, "verbose"):
estimator.verbose = verbosity
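A hypothetical usage sketch of the renamed contextmanager; a plain scikit-learn transformer stands in for one of ATOM's, so only the `set_output` and attribute guards that apply to it take effect:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame({"a": [1.0, 2.0, 3.0]})

scaler = StandardScaler()
with adjust(scaler, transform="pandas", verbose=0) as est:
    est.fit(X)  # fitted with the temporary settings
# the previous engine is stored back on `_engine` when the block exits
```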
@@ -1704,7 +1738,7 @@ def get_versions(models: ClassMap) -> dict[str, str]:
return versions
-def get_corpus(df: DataFrame) -> str:
+def get_corpus(df: pd.DataFrame) -> str:
"""Get text column from a dataframe.
The text column should be called `corpus` (case-insensitive). Also
@@ -1712,7 +1746,7 @@ def get_corpus(df: DataFrame) -> str:
Parameters
----------
- df: dataframe
+ df: pd.DataFrame
Data set from which to get the corpus.
Returns
@@ -1760,164 +1794,125 @@ def time_to_str(t: Scalar) -> str:
return f"{h:02.0f}h:{m:02.0f}m:{s:02.0f}s"
-def n_cols(data: XSelector | YSelector) -> int:
- """Get the number of columns in a dataset.
-
- Parameters
- ----------
- data: sequence or dataframe-like
- Dataset to check.
-
- Returns
- -------
- int or None
- Number of columns.
-
- """
- if (array := np.array(data, dtype="object")).ndim > 1:
- return array.shape[1]
- else:
- return array.ndim # Can be zero when input is a dict
-
-
-def to_pyarrow(column: Series, *, inverse: bool = False) -> Dtype:
- """Get the pyarrow dtype corresponding to a series.
-
- Parameters
- ----------
- column: series
- Column to get the dtype from. If it already has a pyarrow
- dtype, return the original dtype.
-
- inverse: bool, default=False
- Whether to convert to pyarrow or back from pyarrow.
-
- Returns
- -------
- str
- Name of the converted dtype.
-
- """
- if not inverse and not column.dtype.name.endswith("[pyarrow]"):
- if column.dtype.name == "object":
- return "string[pyarrow]" # pyarrow doesn't support 'object'
- else:
- return f"{column.dtype.name}[pyarrow]"
- elif inverse and column.dtype.name.endswith("[pyarrow]"):
- return column.dtype.name[:-9]
-
- return column.dtype.name
-
-
@overload
def to_df(
data: Literal[None],
index: Axes | None = ...,
columns: Axes | None = ...,
- dtype: DtypeArg | None = ...,
) -> None: ...
@overload
def to_df(
- data: XSelector,
+ data: XConstructor,
index: Axes | None = ...,
columns: Axes | None = ...,
- dtype: DtypeArg | None = ...,
-) -> DataFrame: ...
+) -> pd.DataFrame: ...
def to_df(
- data: XSelector | None,
+ data: XConstructor | None,
index: Axes | None = None,
columns: Axes | None = None,
- dtype: DtypeArg | None = None,
-) -> DataFrame | None:
- """Convert a dataset to a dataframe.
+) -> pd.DataFrame | None:
+ """Convert a dataset to a pandas dataframe.
Parameters
----------
data: dataframe-like or None
- Dataset to convert to a dataframe. If None or already a
- dataframe, return unchanged.
+ Dataset to convert to a dataframe. If None or already a
+ pandas dataframe, return unchanged.
- index: sequence, index or None, default=None
+ index: sequence or None, default=None
Values for the index.
columns: sequence or None, default=None
- Name of the columns. Use None for automatic naming.
-
- dtype: str, dict, np.dtype or None, default=None
- Data types for the output columns. If None, the types are
- inferred from the data.
+ Names of the columns. Use None for automatic naming.
Returns
-------
- dataframe or None
- Dataset as dataframe of a type given by the backend.
+ pd.DataFrame or None
+ Data as dataframe. Returns None if data is None.
"""
if data is not None:
- if not isinstance(data, bk.DataFrame):
- # Assign default column names (dict already has column names)
- if not isinstance(data, dict | Pandas) and columns is None:
+ if isinstance(data, pd.DataFrame):
+ data_c = data.copy()
+ elif hasattr(data, "to_pandas"):
+ data_c = data.to_pandas()
+ elif hasattr(data, "__dataframe__"):
+ # Transform from dataframe interchange protocol
+ data_c = pd.api.interchange.from_dataframe(data.__dataframe__())
+ else:
+ # Assign default column names (dict and series already have names)
+ if columns is None and not isinstance(data, dict | pd.Series):
columns = [f"x{i}" for i in range(n_cols(data))]
- if hasattr(data, "to_pandas") and bk.__name__ == "pandas":
- # Convert cuML to pandas
- data_c = data.to_pandas() # type: ignore[operator]
- elif sps.issparse(data):
- data_c = pd.DataFrame.sparse.from_spmatrix(
- data=data,
+ if sps.issparse(data):
+ data_c = pd.DataFrame.sparse.from_spmatrix(data, index, columns)
+ else:
+ data_c = pd.DataFrame(
+ data=data, # type: ignore[misc, arg-type]
index=index,
columns=columns,
+ copy=True,
)
- else:
- data_c = pd.DataFrame(data, index, columns) # type: ignore[arg-type, misc]
+
+ # If text dataset, change the name of the column to corpus
+ if list(data_c.columns) == ["x0"] and data_c.dtypes[0].name in CAT_TYPES:
+ data_c = data_c.rename(columns={data_c.columns[0]: "corpus"})
else:
- data_c = data
+ # Convert all column names to str
+ data_c.columns = data_c.columns.astype(str)
- if dtype is not None:
- data_c = data_c.astype(dtype)
+        # No duplicate column names are allowed
+ if data_c.columns.duplicated().any():
+ raise ValueError("Duplicate column names found in X.")
- if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow":
- data_c = data_c.astype({n: to_pyarrow(col) for n, col in data_c.items()})
+ if columns is not None:
+ # Reorder columns to the provided order
+ try:
+ data_c = data_c[list(columns)] # Force order determined by columns
+ except KeyError:
+ raise ValueError(
+ f"The columns are different than seen at fit time. Features "
+ f"{set(data_c.columns) - set(columns)} " # type: ignore[arg-type]
+ "are missing in X."
+ ) from None
return data_c
-
- return data
+ else:
+ return None
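Illustrative conversions through the new `to_df` (inputs invented):

```python
import numpy as np

to_df(None)                            # None
df = to_df(np.array([[1, 2], [3, 4]]))
list(df.columns)                       # ['x0', 'x1'], default naming
to_df({"a": [1, 2], "b": [3, 4]})      # dict keys become the column names
```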
@overload
def to_series(
data: Literal[None],
index: Axes | None = ...,
- name: Hashable | None = ...,
- dtype: Dtype | None = ...,
+ name: str | None = ...,
) -> None: ...
@overload
def to_series(
- data: dict[str, Any] | Sequence[Any],
+ data: dict[str, Any] | Sequence[Any] | pd.DataFrame,
index: Axes | None = ...,
- name: Hashable | None = ...,
- dtype: Dtype | None = ...,
-) -> Series: ...
+ name: str | None = ...,
+) -> pd.Series: ...
def to_series(
- data: dict[str, Any] | Sequence[Any] | None,
+ data: dict[str, Any] | Sequence[Any] | pd.DataFrame | None,
index: Axes | None = None,
- name: Hashable | None = None,
- dtype: Dtype | None = None,
-) -> Series | None:
- """Convert a sequence to a series.
+ name: str | None = None,
+) -> pd.Series | None:
+ """Convert a sequence to a pandas series.
Parameters
----------
- data: dict, sequence or None
- Data to convert. If None, return unchanged.
+ data: dict, sequence, pd.DataFrame or None
+ Data to convert. If None or already a pandas series, return
+ unchanged.
index: sequence, index or None, default=None
Values for the index.
@@ -1925,99 +1920,84 @@ def to_series(
name: str or None, default=None
Name of the series.
- dtype: str, np.dtype or None, default=None
- Data type for the output series. If None, the type is
- inferred from the data.
-
Returns
-------
- series or None
- Sequence as series of a type given by the backend.
+ pd.Series or None
+ Data as series. Returns None if data is None.
"""
if data is not None:
- if not isinstance(data, bk.Series):
- if hasattr(data, "to_pandas") and bk.__name__ == "pandas":
- data_c = data.to_pandas() # Convert cuML to pandas
- else:
- # Flatten for arrays with shape (n_samples, 1), sometimes returned by cuML
- data_c = pd.Series( # type: ignore[misc]
- data=np.array(data, dtype="object").ravel().tolist(),
- index=index,
- name=getattr(data, "name", name),
- dtype=dtype, # type: ignore[arg-type]
- )
+ if isinstance(data, pd.Series):
+ data_c = data.copy()
+ elif isinstance(data, pd.DataFrame):
+ data_c = data.iloc[:, 0].copy()
+ elif hasattr(data, "to_pandas"):
+ data_c = data.to_pandas()
else:
- data_c = data
+ try:
+ # Flatten for arrays with shape=(n_samples, 1)
+ array = np.asarray(data).ravel().tolist()
+ except ValueError:
+ # Fails for inhomogeneous data
+ array = data
- if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow":
- data_c = data_c.astype(to_pyarrow(data_c))
+ data_c = pd.Series(array, index=index, name=name or "target", copy=True)
return data_c
-
- return data
+ else:
+ return None
@overload
-def to_pandas(
+def to_tabular(
data: Literal[None],
index: Axes | None = ...,
- columns: Axes | None = ...,
- name: str | None = ...,
- dtype: DtypeArg | None = ...,
+ columns: str | Axes | None = ...,
) -> None: ...
@overload
-def to_pandas(
+def to_tabular(
data: YConstructor,
index: Axes | None = ...,
- columns: Axes | None = ...,
- name: str | None = ...,
- dtype: DtypeArg | None = ...,
+ columns: str | Axes | None = ...,
) -> Pandas: ...
-def to_pandas(
+def to_tabular(
data: YConstructor | None,
index: Axes | None = None,
- columns: Axes | None = None,
- name: str | None = None,
- dtype: DtypeArg | None = None,
+ columns: str | Axes | None = None,
) -> Pandas | None:
- """Convert a sequence or dataset to a dataframe or series object.
+ """Convert to a tabular pandas type.
If the data is one-dimensional, convert to series, else to a
dataframe.
Parameters
----------
- data: dict, sequence, dataframe or None
+ data: dict, sequence, pd.DataFrame or None
Data to convert. If None, return unchanged.
index: sequence, index or None, default=None
Values for the index.
- columns: sequence or None, default=None
+ columns: str, sequence or None, default=None
Name of the columns. Use None for automatic naming.
- name: str or None, default=None
- Name of the series.
-
- dtype: str, dict, np.dtype or None, default=None
- Data type for the output series. If None, the type is
- inferred from the data.
-
Returns
-------
- series, dataframe or None
- Data as a Pandas object.
+ pd.Series, pd.DataFrame or None
+ Data as a pandas object.
"""
- if n_cols(data) == 1:
- return to_series(data, index=index, name=name, dtype=dtype) # type: ignore[misc, arg-type]
+ if (n_targets := n_cols(data)) == 1:
+ return to_series(data, index=index, name=flt(columns)) # type: ignore[misc, arg-type]
else:
- return to_df(data, index=index, columns=columns, dtype=dtype)
+ if columns is None and not hasattr(data, "__dataframe__"):
+ columns = [f"y{i}" for i in range(n_targets)]
+
+ return to_df(data, index=index, columns=columns) # type: ignore[misc, arg-type]
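The dispatch in a nutshell (values invented): one column becomes a series named "target" by default, more columns become a dataframe with `y0...yn` names:

```python
import numpy as np

y1 = to_tabular([1, 0, 1])
type(y1).__name__, y1.name           # ('Series', 'target')

y2 = to_tabular(np.array([[1, 2], [3, 4]]))
type(y2).__name__, list(y2.columns)  # ('DataFrame', ['y0', 'y1'])
```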
def check_is_fitted(
@@ -2053,26 +2033,6 @@ def check_is_fitted(
Whether the estimator is fitted.
"""
-
- def check_attr(attr: str) -> bool:
- """Return whether an attribute is False or empty.
-
- Parameters
- ----------
- attr: str
- Name of the attribute to check.
-
- Returns
- -------
- bool
- Whether the attribute's value is False or empty.
-
- """
- if isinstance(value := getattr(obj, attr), pandas_t):
- return value.empty
- else:
- return not value
-
if hasattr(obj, "_is_fitted"):
is_fitted = obj._is_fitted
else:
@@ -2182,10 +2142,10 @@ def get_custom_scorer(metric: str | MetricFunction | Scorer) -> Scorer:
# Pipeline functions =============================================== >>
def name_cols(
- array: TReturn,
- original_df: DataFrame,
+ df: pd.DataFrame,
+ original_df: pd.DataFrame,
col_names: list[str],
-) -> list[str]:
+) -> pd.Index:
"""Get the column names after a transformation.
If the number of columns is unchanged, the original
@@ -2194,10 +2154,10 @@ def name_cols(
Parameters
----------
- array: np.ndarray, sps.matrix, series or dataframe
+ df: pd.DataFrame
Transformed dataset.
- original_df: dataframe
+ original_df: pd.DataFrame
Original dataset.
col_names: list of str
@@ -2205,24 +2165,24 @@ def name_cols(
Returns
-------
- list of str
+ pd.Index
Column names.
"""
# If columns were only transformed, return og names
- if array.shape[1] == len(col_names):
- return col_names
+ if df.shape[1] == len(col_names):
+ return pd.Index(col_names)
# If columns were added or removed
temp_cols = []
- for i, col in enumerate(array.T):
+ for i, column in enumerate(get_cols(df)):
# equal_nan=True fails for non-numeric dtypes
- mask = original_df.apply(
+ mask = original_df.apply( # type: ignore[type-var]
lambda c: np.array_equal(
a1=c,
- a2=col,
- equal_nan=is_numeric_dtype(c) and np.issubdtype(col.dtype, np.number),
- ),
+ a2=column,
+ equal_nan=is_numeric_dtype(c) and np.issubdtype(column.dtype.name, np.number),
+ )
)
if any(mask) and mask[mask].index[0] not in temp_cols:
@@ -2239,7 +2199,7 @@ def name_cols(
else:
counter += 1
- return temp_cols
+ return pd.Index(temp_cols)
def get_col_order(
@@ -2287,10 +2247,10 @@ def get_col_order(
def reorder_cols(
transformer: Transformer,
- df: DataFrame,
- original_df: DataFrame,
+ df: pd.DataFrame,
+ original_df: pd.DataFrame,
col_names: list[str],
-) -> DataFrame:
+) -> pd.DataFrame:
"""Reorder the columns to their original order.
This function is necessary in case only a subset of the
@@ -2302,10 +2262,10 @@ def reorder_cols(
transformer: Transformer
Instance that transformed `df`.
- df: dataframe
+ df: pd.DataFrame
Dataset to reorder.
- original_df: dataframe
+ original_df: pd.DataFrame
Original dataset (states the order).
col_names: list of str
@@ -2313,7 +2273,7 @@ def reorder_cols(
Returns
-------
- dataframe
+ pd.DataFrame
Dataset with reordered columns.
"""
@@ -2353,8 +2313,8 @@ def reorder_cols(
def fit_one(
estimator: Estimator,
- X: XConstructor | None = None,
- y: YConstructor | None = None,
+ X: pd.DataFrame | None = None,
+ y: Pandas | None = None,
message: str | None = None,
**fit_params,
) -> Estimator:
@@ -2365,19 +2325,12 @@ def fit_one(
estimator: Estimator
Instance to fit.
- X: dataframe-like or None, default=None
+ X: pd.DataFrame or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
+ `X` is ignored.
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
-
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: pd.Series, pd.DataFrame or None, default=None
+ Target column(s) corresponding to `X`.
message: str or None
Short message. If None, nothing will be printed.
@@ -2391,30 +2344,27 @@ def fit_one(
Fitted estimator.
"""
- Xt = to_df(X, index=getattr(y, "index", None))
- yt = to_pandas(y, index=getattr(Xt, "index", None))
-
with _print_elapsed_time("Pipeline", message):
if hasattr(estimator, "fit"):
- kwargs = {}
- inc = getattr(estimator, "_cols", getattr(Xt, "columns", []))
+ kwargs: dict[str, Pandas] = {}
+ inc = getattr(estimator, "_cols", getattr(X, "columns", []))
if "X" in (params := sign(estimator.fit)):
- if Xt is not None and (cols := [c for c in inc if c in Xt]):
- kwargs["X"] = Xt[cols]
+ if X is not None and (cols := [c for c in inc if c in X]):
+ kwargs["X"] = X[cols]
# X is required but has not been provided
if len(kwargs) == 0:
- if yt is not None and hasattr(estimator, "_cols"):
- kwargs["X"] = to_df(yt)[inc]
+ if y is not None and hasattr(estimator, "_cols"):
+ kwargs["X"] = to_df(y)[inc]
elif params["X"].default != Parameter.empty:
kwargs["X"] = params["X"].default # Fill X with default
- elif Xt is None:
+ elif X is None:
raise ValueError(
"Exception while trying to fit transformer "
f"{estimator.__class__.__name__}. Parameter "
"X is required but has not been provided."
)
- elif Xt.empty:
+ elif X.empty:
raise ValueError(
"Exception while trying to fit transformer "
f"{estimator.__class__.__name__}. Parameter X is "
@@ -2423,8 +2373,8 @@ def fit_one(
"target column, e.g., atom.decompose(columns=-1)."
)
- if "y" in params and yt is not None:
- kwargs["y"] = yt
+ if "y" in params and y is not None:
+ kwargs["y"] = y
# Keep custom attrs since some transformers reset during fit
with keep_attrs(estimator):
@@ -2435,11 +2385,11 @@ def fit_one(
def transform_one(
transformer: Transformer,
- X: XConstructor | None = None,
- y: YConstructor | None = None,
+ X: pd.DataFrame | None = None,
+ y: Pandas | None = None,
method: Literal["transform", "inverse_transform"] = "transform",
**transform_params,
-) -> tuple[DataFrame | None, Pandas | None]:
+) -> tuple[pd.DataFrame | None, Pandas | None]:
"""Transform the data using one estimator.
Parameters
@@ -2447,19 +2397,12 @@ def transform_one(
transformer: Transformer
Instance to fit.
- X: dataframe-like or None, default=None
+ X: pd.DataFrame or None, default=None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
-
- y: int, str, dict, sequence, dataframe or None, default=None
- Target column corresponding to `X`.
+ `X` is ignored.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: pd.Series, pd.DataFrame or None, default=None
+ Target column(s) corresponding to `X`.
method: str, default="transform"
Method to apply: transform or inverse_transform.
@@ -2469,127 +2412,103 @@ def transform_one(
Returns
-------
- dataframe or None
+ pd.DataFrame or None
Feature set. Returns None if not provided.
- series, dataframe or None
- Target column. Returns None if not provided.
+ pd.Series, pd.DataFrame or None
+ Target column(s). Returns None if not provided.
"""
- def prepare_df(out: TReturn, og: DataFrame) -> DataFrame:
- """Convert to df and set correct column names and order.
-
- If ATOM's data backend="pyarrow", convert the dtypes.
+ def prepare_df(out: XConstructor, og: pd.DataFrame) -> pd.DataFrame:
+ """Convert to df and set the correct column names.
Parameters
----------
- out: np.ndarray, sps.matrix, series or dataframe
+ out: dataframe-like
Data returned by the transformation.
- og: dataframe
+ og: pd.DataFrame
Original dataframe, prior to transformations.
Returns
-------
- dataframe
+ pd.DataFrame
Transformed dataset.
"""
- use_cols = [c for c in inc if c in og.columns]
+ out_c = to_df(out, index=og.index)
- # Convert to pandas and assign proper column names
- if not isinstance(out, dataframe_t):
+ # Assign proper column names
+ use_cols = [c for c in inc if c in og.columns]
+ if not isinstance(out, pd.DataFrame):
if hasattr(transformer, "get_feature_names_out"):
- columns = transformer.get_feature_names_out()
+ out_c.columns = transformer.get_feature_names_out()
else:
- columns = name_cols(out, og, use_cols)
- else:
- columns = out.columns
-
- out = to_df(out, index=og.index, columns=columns)
+ out_c.columns = name_cols(out_c, og, use_cols)
# Reorder columns if only a subset was used
if len(use_cols) != og.shape[1]:
- return reorder_cols(transformer, out, og, use_cols)
+ return reorder_cols(transformer, out_c, og, use_cols)
else:
- return out
-
- Xt = to_df(
- data=X,
- index=getattr(y, "index", None),
- columns=getattr(transformer, "feature_names_in_", None),
- )
- yt = to_pandas(
- y,
- index=getattr(Xt, "index", None),
- columns=getattr(transformer, "target_names_in_", None),
- name=flt(getattr(transformer, "target_names_in_", None)),
- )
+ return out_c
use_y = True
kwargs: dict[str, Any] = {}
- inc = list(getattr(transformer, "_cols", getattr(Xt, "columns", [])))
+ inc = list(getattr(transformer, "_cols", getattr(X, "columns", [])))
if "X" in (params := sign(getattr(transformer, method))):
- if Xt is not None and (cols := [c for c in inc if c in Xt]):
- kwargs["X"] = Xt[cols]
+ if X is not None and (cols := [c for c in inc if c in X]):
+ kwargs["X"] = X[cols]
# X is required but has not been provided
if len(kwargs) == 0:
- if yt is not None and hasattr(transformer, "_cols"):
- kwargs["X"] = to_df(yt)[inc]
+ if y is not None and hasattr(transformer, "_cols"):
+ kwargs["X"] = to_df(y)[inc]
use_y = False
elif params["X"].default != Parameter.empty:
kwargs["X"] = params["X"].default # Fill X with default
else:
- return Xt, yt # If X is needed, skip the transformer
+ return X, y # If X is needed, skip the transformer
if "y" in params:
# We skip `y` when already added to `X`
- if yt is not None and use_y:
- kwargs["y"] = yt
+ if y is not None and use_y:
+ kwargs["y"] = y
elif "X" not in params:
- return Xt, yt # If y is None and no X in transformer, skip the transformer
+ return X, y # If y is None and no X in transformer, skip the transformer
- out: TReturns = getattr(transformer, method)(**kwargs, **transform_params)
+ caller = getattr(transformer, method)
+ out: YConstructor | tuple[XConstructor, YConstructor] = caller(**kwargs, **transform_params)
# Transform can return X, y or both
- if isinstance(out, tuple):
- X_new = prepare_df(out[0], Xt)
- y_new = to_pandas(
- data=out[1],
- index=Xt.index,
- name=getattr(yt, "name", None),
- columns=getattr(yt, "columns", None),
- )
- if isinstance(yt, dataframe_t):
- y_new = prepare_df(y_new, yt)
- elif "X" in params and X is not None and any(c in Xt for c in inc):
+ X_new: pd.DataFrame | None
+ y_new: Pandas | None
+ if isinstance(out, tuple) and X is not None:
+ X_new = prepare_df(out[0], X)
+ y_new = to_tabular(out[1], index=X_new.index)
+ if isinstance(y, pd.DataFrame) and isinstance(y_new, pd.DataFrame):
+ y_new = prepare_df(y_new, y)
+ elif "X" in params and X is not None and any(c in X for c in inc):
# X in -> X out
- X_new = prepare_df(out, Xt)
- y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0)
+ X_new = prepare_df(out, X) # type: ignore[arg-type]
+ y_new = y if y is None else y.set_axis(X_new.index, axis=0)
elif y is not None:
- y_new = to_pandas(
- data=out,
- index=yt.index,
- name=getattr(yt, "name", None),
- columns=getattr(yt, "columns", None),
- )
- X_new = Xt if Xt is None else Xt.set_index(y_new.index)
- if isinstance(yt, dataframe_t):
- y_new = prepare_df(y_new, yt)
+ y_new = to_tabular(out)
+ X_new = X if X is None else X.set_index(y_new.index)
+ if isinstance(y, pd.DataFrame) and isinstance(y_new, pd.DataFrame):
+ y_new = prepare_df(y_new, y)
return X_new, y_new
def fit_transform_one(
transformer: Transformer,
- X: XConstructor | None,
- y: YConstructor | None,
+ X: pd.DataFrame | None,
+ y: Pandas | None,
message: str | None = None,
**fit_params,
-) -> tuple[DataFrame | None, Series | None, Transformer]:
+) -> tuple[pd.DataFrame | None, Pandas | None, Transformer]:
"""Fit and transform the data using one estimator.
Estimators without a `transform` method aren't transformed.
@@ -2599,19 +2518,12 @@ def fit_transform_one(
transformer: Transformer
Instance to fit.
- X: dataframe-like or None
+ X: pd.DataFrame or None
Feature set with shape=(n_samples, n_features). If None,
- X is ignored.
-
- y: int, str, dict, sequence, dataframe or None
- Target column corresponding to `X`.
+ `X` is ignored.
- - If None: y is ignored.
- - If int: Position of the target column in X.
- - If str: Name of the target column in X.
- - If sequence: Target column with shape=(n_samples,) or
- sequence of column names or positions for multioutput tasks.
- - If dataframe: Target columns for multioutput tasks.
+ y: pd.Series, pd.DataFrame or None
+ Target column(s) corresponding to `X`.
message: str or None, default=None
Short message. If None, nothing will be printed.
@@ -2621,20 +2533,20 @@ def fit_transform_one(
Returns
-------
- dataframe or None
+ pd.DataFrame or None
Feature set. Returns None if not provided.
- series or None
- Target column. Returns None if not provided.
+ pd.Series, pd.DataFrame or None
+ Target column(s). Returns None if not provided.
Transformer
Fitted transformer.
"""
fit_one(transformer, X, y, message, **fit_params)
- X, y = transform_one(transformer, X, y)
+ Xt, yt = transform_one(transformer, X, y)
- return X, y, transformer
+ return Xt, yt, transformer
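A hypothetical end-to-end call with a plain scikit-learn transformer (data invented); the returned `Xt` keeps the pandas column names via `get_feature_names_out`:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
y = pd.Series([0, 1, 0], name="target")

Xt, yt, scaler = fit_transform_one(StandardScaler(), X, y)
list(Xt.columns)  # ['a', 'b']
```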
# Decorators ======================================================= >>
@@ -2770,50 +2682,6 @@ def wrapper(*args, **kwargs) -> Any:
return wrapper
-def wrap_transformer_methods(f: Callable) -> Callable:
- """Wrap transformer methods with shared code.
-
- The following operations are always performed:
-
- - Transform the input to pandas types.
- - Add the `feature_names_in_` and `n_features_in_` attributes.
- - Check if the instance is fitted before transforming.
-
- """
-
- @wraps(f)
- @beartype
- def wrapper(
- self: T_Transformer,
- X: XSelector | None = None,
- y: YSelector | None = None,
- **kwargs,
- ) -> T_Transformer | Pandas | tuple[DataFrame, Pandas]:
- if f.__name__ == "fit":
- Xt, yt = self._check_input(X, y)
- self._check_feature_names(Xt, reset=True)
- self._check_n_features(Xt, reset=True)
- return f(self, Xt, yt, **kwargs)
-
- else:
- if "TransformerMixin" not in str(self.fit):
- check_is_fitted(self)
-
- Xt, yt = self._check_input(
- X=X,
- y=y,
- columns=getattr(self, "feature_names_in_", None),
- name=getattr(self, "target_names_in_", None),
- )
-
- if "y" in sign(f):
- return f(self, Xt, yt, **kwargs)
- else:
- return f(self, Xt, **kwargs)
-
- return wrapper
-
-
def make_sklearn(
obj: T_Estimator,
feature_names_out: FeatureNamesOut = "one-to-one",
@@ -2881,10 +2749,10 @@ def wrapper(self, *args, **kwargs):
return wrapper
- if not obj.__module__.startswith(("atom.", "sklearn.", "imblearn.")) and hasattr(obj, "fit"):
- if isinstance(obj, type):
+ if not obj.__module__.startswith(("atom.", "sklearn.", "imblearn.")):
+ if isinstance(obj, type) and hasattr(obj, "fit"):
obj.fit = wrap_fit(obj.fit)
- else:
+ elif hasattr(obj.__class__, "fit"):
obj.fit = wrap_fit(obj.__class__.fit).__get__(obj) # type: ignore[method-assign]
return obj
diff --git a/docs_sources/changelog/v6.x.x.md b/docs_sources/changelog/v6.x.x.md
index c967f1aaa..c681b188d 100644
--- a/docs_sources/changelog/v6.x.x.md
+++ b/docs_sources/changelog/v6.x.x.md
@@ -10,6 +10,7 @@
* Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/)
and [Python 3.9](https://www.python.org/downloads/release/python-390/).
* New data engines. Read more in the [user guide][data-acceleration].
+* Added the `dask` [parallelization backend][parallel-execution].
* Improved memory optimizations. Read more in the [user guide][memory-considerations].
* Added the `iterative` strategy for [numerical imputation][imputer].
* Added the `hdbscan` strategy to the [Pruner][] class.
diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md
index fe360d19a..7dcaa5c61 100644
--- a/docs_sources/dependencies.md
+++ b/docs_sources/dependencies.md
@@ -26,29 +26,24 @@ packages are necessary for its correct functioning.
* **[beartype](https://beartype.readthedocs.io/en/latest/)** (>=0.16.4)
* **[category-encoders](https://contrib.scikit-learn.org/categorical-encoding/index.html)** (>=2.6.3)
-* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8)
* **[dill](https://pypi.org/project/dill/)** (>=0.3.6)
+* **[featuretools](https://www.featuretools.com/)** (>=1.28.0)
* **[gplearn](https://gplearn.readthedocs.io/en/stable/index.html)** (>=0.4.2)
* **[imbalanced-learn](https://imbalanced-learn.readthedocs.io/en/stable/api.html)** (>=0.12.0)
* **[ipython](https://ipython.readthedocs.io/en/stable/)** (>=8.11.0)
* **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1)
-* **[featuretools](https://www.featuretools.com/)** (>=1.28.0)
* **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1)
* **[matplotlib](https://matplotlib.org/)** (>=3.7.2)
-* **[mlflow](https://mlflow.org/)** (>=2.7.1)
-* **[modin[ray]](https://modin.readthedocs.io/en/stable/)** (>=0.25.0)
+* **[mlflow](https://mlflow.org/)** (>=2.10.2)
* **[nltk](https://www.nltk.org/)** (>=3.8.1)
* **[numpy](https://numpy.org/)** (>=1.23.0)
* **[optuna](https://optuna.org/)** (>=3.4.0)
-* **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2)
-* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1)
+* **[pandas](https://pandas.pydata.org/)** (>=2.1.2)
* **[plotly](https://plotly.com/python/)** (>=5.18.0)
-* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0)
* **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.4.0)
-* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1)
* **[scipy](https://www.scipy.org/)** (>=1.10.1)
* **[shap](https://github.com/slundberg/shap/)** (>=0.43.0)
-* **[sktime[forecasting]](http://www.sktime.net/en/latest/)** (>=0.24.0)
+* **[sktime[forecasting]](http://www.sktime.net/en/latest/)** (>=0.26.0)
* **[statsmodels](https://www.statsmodels.org/stable/index.html)** (>=0.14.1)
* **[zoofs](https://jaswinder9051998.github.io/zoofs/)** (>=0.1.26)
@@ -61,9 +56,19 @@ additional libraries. You can install all the optional dependencies using
* **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5)
* **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2)
+* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8)
+* **[dask[distributed]](https://dask.org/)** (>=2024.2.0)
* **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3)
* **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4)
* **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0)
+* **[modin[ray]](https://modin.readthedocs.io/en/stable/)** (>=0.25.0)
+* **[polars](https://pola.rs/)** (>=0.20.7)
+* **[pyarrow](https://arrow.apache.org/docs/python/)** (>=15.0.0)
+* **[pyspark](https://github.com/apache/spark/tree/master/python)** (>=3.5.0)
+* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0)
+* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1)
+* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1)
* **[schemdraw](https://schemdraw.readthedocs.io/en/latest/index.html)** (>=0.16)
* **[statsforecast](https://github.com/Nixtla/statsforecast/)** (>=1.6.0)
* **[sweetviz](https://github.com/fbdesignpro/sweetviz)** (>=2.3.1)
@@ -86,7 +91,6 @@ running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/l
* **[pandas_stubs](https://pypi.org/project/pandas-stubs/)** (>=2.1.1.230928)
* **[pre-commit](https://pre-commit.com/)** (>=3.5.0)
* **[ruff](https://docs.astral.sh/ruff/)** (>=0.1.7)
-* **[types-requests](https://github.com/python/typeshed)** (>=2.31.0.10)
**Testing**
diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py
index efddad058..5ade3d5cc 100644
--- a/docs_sources/scripts/autodocs.py
+++ b/docs_sources/scripts/autodocs.py
@@ -85,6 +85,7 @@
votingregressor="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html",
ensembleforecaster="https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.forecasting.compose.EnsembleForecaster.html",
# Data cleaning
+ set_output="https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html",
clustercentroids="https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.ClusterCentroids.html",
onehotencoder="https://contrib.scikit-learn.org/category_encoders/onehot.html",
hashingencoder="https://contrib.scikit-learn.org/category_encoders/hashing.html",
@@ -926,7 +927,7 @@ def types_conversion(dtype: str) -> str:
"Pipeline": "[Pipeline][]",
"collections.abc.Hashable": "str",
"Scalar": "int | float",
- "Pandas": "Series | DataFrame",
+ "Pandas": "Series | pd.DataFrame",
"int | numpy.integer": "int",
"float | numpy.floating": "float",
"Series | modin.pandas.series.Series": "Series",
diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md
index 371252f2d..1860f1dc9 100644
--- a/docs_sources/user_guide/accelerating.md
+++ b/docs_sources/user_guide/accelerating.md
@@ -55,32 +55,48 @@ regardless of the engine parameter.
## Data acceleration
+ATOM is mostly built around [sklearn](https://scikit-learn.org/stable/) (and [sktime](https://www.sktime.net/en/stable/) for [time series][]
+tasks), and both these libraries use numpy as their computation backend. Since
+`atom` relies heavily on column names, it uses pandas (which in turn uses numpy)
+as its data backend. However, for the user's convenience, it implements
+several data engines that wrap the data in a different type when it's
+returned to the user. This is very similar to sklearn's [set_output][]
+behaviour, but ATOM extends it to many more data types. For example, selecting
+the `polars` data engine makes `atom.dataset` return a polars dataframe and
+`atom.winner.predict(X)` return a polars series.
+
The data engine can be specified through the [`engine`][atomclassifier-engine]
-parameter, e.g. `#!python engine="pyarrow"` or
-`#!python engine={"data": "pyarrow", "estimator": "sklearnex"}` to combine it
-with an [estimator engine][estimator acceleration]. ATOM integrates the following
-data engines:
-
-- **pandas**: This is the default data engine. It uses the [`pandas`](https://pandas.pydata.org/docs/index.html)
- library with [`numpy`](https://numpy.org/) as backend.
-- **pyarrow**: This engine also uses [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html), but with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html)
- backend, instead of `numpy`. PyArrow is a cross-language, platform-independent,
- in-memory data format, that provides an efficient and fast way to serialize and
- deserialize data.
+parameter, e.g. `#!python engine="pyarrow"` or `#!python engine={"data": "pyarrow",
+"estimator": "sklearnex"}` to combine it with an [estimator engine][estimator acceleration].
+ATOM integrates the following data engines:
+
+- **numpy**: Transforms the data to a [`numpy`](https://numpy.org/) array.
+- **pandas**: Leaves the dataset as a [`pandas`](https://pandas.pydata.org/docs/index.html) object. This is the default
+  engine and leaves the data unchanged.
+- **pandas-pyarrow**: Transforms the data to [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html) with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html)
+ backend. Read more in pandas' [user guide](https://pandas.pydata.org/docs/user_guide/pyarrow.html).
+- **polars**: The [polars](https://docs.pola.rs/) library is a blazingly fast dataframe library
+ implemented in Rust and based on Apache Arrow. Transforms the data to a polars
+ dataframe or series.
+- **polars-lazy**: This engine is similar to the `polars` engine, but it returns
+  a [pl.LazyFrame](https://docs.pola.rs/py-polars/html/reference/lazyframe/index.html) instead of a [pl.DataFrame](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html).
+- **pyarrow**: PyArrow is a cross-language, platform-independent, in-memory data
+  format that provides an efficient and fast way to serialize and deserialize data.
+  The data is transformed to a [pa.Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) or [pa.Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html).
- **modin**: The [modin](https://modin.readthedocs.io/en/stable/) library is a multi-threading, drop-in replacement
- for pandas, that uses [Ray](https://www.ray.io/) as backend.
+  for pandas, that uses [Ray](https://www.ray.io/) as backend. Transforms the data to a modin dataframe
+ or series.
+- **dask**: The [dask](https://docs.dask.org/en/stable/) library is a flexible tool for parallel and
+  distributed computing. Transforms the data to a [dask dataframe](https://docs.dask.org/en/latest/dataframe.html) or [dask series](https://docs.dask.org/en/stable/generated/dask.dataframe.Series.html).
+- **pyspark**: The [pyspark](https://spark.apache.org/docs/latest/api/python/index.html) library is the Python API for Apache Spark.
+  Transforms the data to a [pyspark dataframe](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html) or [pyspark series](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.Column.html).
+- **pyspark-pandas**: Similar to the `pyspark` engine, but it returns pyspark objects
+ with the [pandas API](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html).
!!! note
- Although atom accepts a numpy array or a list of lists as input, it
- converts the data internally to the specified data engine since its API
- requires column names and indices.
-
-!!! warning
- Depending on the data engine, the following limitations apply:
-
- - The `pyarrow` engine doesn't support [sparse datasets][].
- - The [LightGBM][] and [XGBoost][] models don't support the `pyarrow` engine.
- - The `modin` engine is not compatible with [forecast][time-series] tasks.
+    It's important to realize that, within atom, the data is still processed using
+    pandas (with the numpy backend). The data is only transformed to the selected
+    format when it's returned to the user.
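For example, a minimal sketch of selecting a data engine (dataset and model are illustrative, and the `polars` engine assumes polars is installed):

```python
from atom import ATOMClassifier
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

atom = ATOMClassifier(X, y, engine="polars", verbose=0)
print(type(atom.dataset))            # polars dataframe
atom.run(models="LR")
print(type(atom.winner.predict(X)))  # polars series
```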
## Estimator acceleration
@@ -238,16 +254,18 @@ parallelization backends.
mostly useful when the execution bottleneck is a compiled extension that
explicitly releases the GIL (for instance a Cython loop wrapped in a "with nogil"
block or an expensive call to a library such as numpy).
-* **ray:** [Ray](https://www.ray.io/) is an open-source unified compute framework
- that makes it easy to scale AI and Python workloads. Read more about Ray [here](https://docs.ray.io/en/latest/ray-core/walkthrough.html).
- See [here][example-ray-backend] an example use case.
+* **ray:** [Ray](https://www.ray.io/) is an open-source unified compute framework that makes it
+ easy to scale AI and Python workloads. Read more about Ray [here](https://docs.ray.io/en/latest/ray-core/walkthrough.html). See
+  [here][example-ray-backend] for an example use case.
+* **dask:** [Dask](https://docs.dask.org/en/stable/) is a flexible parallel computing library for analytics.
+ Read more about Dask [here](https://docs.dask.org/en/stable/10-minutes-to-dask.html).
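A hedged sketch of selecting the new backend (assumes `dask[distributed]` is installed; the data and models are illustrative, and `parallel` is forwarded to the trainer):

```python
from atom import ATOMClassifier
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

atom = ATOMClassifier(X, y, n_jobs=2, backend="dask", verbose=0)
atom.run(models=["RF", "LGB"], parallel=True)  # trains both models in parallel
```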
The parallelization backend is applied in the following cases:
* In every individual estimator that uses parallelization internally.
* To calculate cross-validated results during [hyperparameter tuning][].
-* To train multiple models in parallel (when the trainer's `parallel` parameter is True).
+* To train multiple models in parallel (when [`parallel=True`][directclassifier-parallel]).
* To calculate partial dependencies in [plot_partial_dependence][].
!!! note
diff --git a/docs_sources/user_guide/data_management.md b/docs_sources/user_guide/data_management.md
index 0e000bdb2..f95fb2bbb 100644
--- a/docs_sources/user_guide/data_management.md
+++ b/docs_sources/user_guide/data_management.md
@@ -38,8 +38,8 @@ or together:
* X
* X, y
-Remember to use the `y` parameter to indicate the target column in X when
-using the first option. If not specified, the last column in X is used as
+Remember to use the `y` parameter to indicate the target column in `X` when
+using the first option. If not specified, the last column in `X` is used as
the target. In both these cases, the sizes of the sets are defined using the
`test_size` and `holdout_size` parameters. Note that the splits are made
after the subsample of the dataset with the `n_rows` parameter (when not
diff --git a/docs_sources/user_guide/nomenclature.md b/docs_sources/user_guide/nomenclature.md
index ef758c094..33bb42c2c 100644
--- a/docs_sources/user_guide/nomenclature.md
+++ b/docs_sources/user_guide/nomenclature.md
@@ -35,22 +35,22 @@ the target column.