From dfbb0d92540952a04dd3c38b1e29a3de26bc2f5e Mon Sep 17 00:00:00 2001 From: Mavs Date: Sat, 25 Nov 2023 20:02:43 +0100 Subject: [PATCH] set_output=pandas --- atom/__init__.py | 9 +- atom/api.py | 2 +- atom/atom.py | 17 +- atom/basemodel.py | 20 +- atom/baserunner.py | 29 +- atom/basetrainer.py | 6 +- atom/basetransformer.py | 17 +- atom/branch/branch.py | 24 +- atom/branch/branchmanager.py | 2 +- atom/data_cleaning.py | 605 ++++++++++-------------- atom/ensembles.py | 14 +- atom/feature_engineering.py | 230 ++++----- atom/models/classreg.py | 40 +- atom/models/custom.py | 2 +- atom/models/ensembles.py | 16 +- atom/models/ts.py | 3 +- atom/nlp.py | 121 +++-- atom/pipeline.py | 8 +- atom/plots/basefigure.py | 6 +- atom/plots/baseplot.py | 12 +- atom/plots/dataplot.py | 4 +- atom/plots/hyperparametertuningplot.py | 12 +- atom/plots/predictionplot.py | 4 +- atom/plots/shapplot.py | 3 +- atom/training.py | 8 +- atom/utils/types.py | 83 +++- atom/utils/utils.py | 119 +++-- docs_sources/scripts/autodocs.py | 3 +- docs_sources/user_guide/nomenclature.md | 16 +- tests/conftest.py | 4 +- tests/test_api.py | 2 +- tests/test_atom.py | 11 +- tests/test_baserunner.py | 2 +- tests/test_basetransformer.py | 6 - tests/test_data_cleaning.py | 52 +- tests/test_feature_engineering.py | 2 +- tests/test_pipeline.py | 5 +- 37 files changed, 771 insertions(+), 748 deletions(-) diff --git a/atom/__init__.py b/atom/__init__.py index 8233f46b5..661ec56ef 100644 --- a/atom/__init__.py +++ b/atom/__init__.py @@ -3,9 +3,16 @@ """ Automated Tool for Optimized Modeling (ATOM) Author: Mavs -Description: Import API and version. +Description: Import API and version, and set configuration. """ +import pandas as pd +import sklearn + from atom.api import ATOMClassifier, ATOMForecaster, ATOMModel, ATOMRegressor from atom.utils.constants import __version__ + + +pd.options.mode.copy_on_write = True +sklearn.set_config(transform_output="pandas") diff --git a/atom/api.py b/atom/api.py index 353dddace..5bd6cbde2 100644 --- a/atom/api.py +++ b/atom/api.py @@ -11,9 +11,9 @@ from logging import Logger from pathlib import Path +from typing import TypeVar from beartype import beartype -from beartype.typing import TypeVar from joblib.memory import Memory from sklearn.base import clone diff --git a/atom/atom.py b/atom/atom.py index 7214c2202..12b4e6a2b 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -12,17 +12,18 @@ import os from abc import ABCMeta, abstractmethod from collections import defaultdict +from collections.abc import Callable, Iterator from copy import deepcopy from logging import Logger from pathlib import Path from platform import machine, platform, python_build, python_version from types import MappingProxyType +from typing import Any, Literal, TypeVar import dill as pickle import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Any, Callable, Iterator, Literal, Sequence, TypeVar from joblib.memory import Memory from pandas._typing import DtypeObj from scipy import stats @@ -34,7 +35,7 @@ from atom.branch import Branch, BranchManager from atom.data_cleaning import ( Balancer, Cleaner, Discretizer, Encoder, Imputer, Normalizer, Pruner, - Scaler, + Scaler, TransformerMixin, ) from atom.feature_engineering import ( FeatureExtractor, FeatureGenerator, FeatureGrouper, FeatureSelector, @@ -55,8 +56,9 @@ FloatZeroToOneInc, Index, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, NormalizerStrats, NumericalStrats, 
Operators, Pandas, PrunerStrats, - RowSelector, Scalar, ScalerStrats, Series, TargetSelector, Transformer, - TSIndex, VectorizerStarts, Verbose, Warnings, XSelector, YSelector, + RowSelector, Scalar, ScalerStrats, Sequence, Series, TargetSelector, + Transformer, VectorizerStarts, Verbose, Warnings, XSelector, YSelector, + sequence_t, tsindex_t, ) from atom.utils.utils import ( ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk, @@ -540,7 +542,7 @@ def eda( if isinstance(rows, str): rows_c = [(self.branch._get_rows(rows), rows)] - elif isinstance(rows, Sequence): + elif isinstance(rows, sequence_t): rows_c = [(self.branch._get_rows(r), r) for r in rows] elif isinstance(rows, dict): rows_c = [(self.branch._get_rows(v), k) for k, v in rows.items()] @@ -937,7 +939,7 @@ def stats(self, _vb: Int = -2, /): for set_ in ("train", "test", "holdout"): if (data := getattr(self, set_)) is not None: self._log(f"{set_.capitalize()} set size: {len(data)}", _vb) - if isinstance(self.branch.train.index, TSIndex): + if isinstance(self.branch.train.index, tsindex_t): self._log(f" --> From: {min(data.index)} To: {max(data.index)}", _vb) self._log("-" * 37, _vb) @@ -1147,6 +1149,9 @@ def _add_transformer( ) transformer_c._cols = inc + # Add custom cloning method to keep internal attrs + transformer_c.__class__.__sklearn_clone__ = TransformerMixin.__sklearn_clone__ + if hasattr(transformer_c, "fit"): if not transformer_c.__module__.startswith("atom"): self._log(f"Fitting {transformer_c.__class__.__name__}...", 1) diff --git a/atom/basemodel.py b/atom/basemodel.py index f2d5903a0..681059fd8 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -17,7 +17,7 @@ from importlib import import_module from logging import Logger from pathlib import Path -from typing import overload +from typing import Any, Literal, overload from unittest.mock import patch import dill as pickle @@ -30,7 +30,6 @@ from beartype.roar import ( BeartypeCallHintParamViolation, BeartypeCallHintReturnViolation, ) -from beartype.typing import Any, Literal from joblib.memory import Memory from joblib.parallel import Parallel, delayed from mlflow.data import from_pandas @@ -70,7 +69,8 @@ HT, Backend, Bool, DataFrame, Engine, FHSelector, Float, FloatZeroToOneExc, Int, IntLargerEqualZero, MetricConstructor, NJobs, Pandas, PredictionMethod, Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, - TargetSelector, Verbose, Warnings, XSelector, YSelector, + TargetSelector, Verbose, Warnings, XSelector, YSelector, dataframe_t, + float_t, int_t, ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, @@ -281,7 +281,7 @@ def __contains__(self, item: str) -> bool: return item in self.dataset def __getitem__(self, item: Int | str | list) -> Pandas: - if isinstance(item, Int): + if isinstance(item, int_t): return self.dataset[self.columns[item]] else: return self.dataset[item] # Get a subset of the dataset @@ -431,7 +431,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: Estimator instance. 
""" - # Separate the parameters for the estimator from those in sub-estimators + # Separate the params for the estimator from those in sub-estimators base_params, sub_params = {}, {} for name, value in params.items(): if "__" not in name: @@ -818,7 +818,7 @@ def _get_score( else: if threshold and self.task.is_binary and hasattr(self, "predict_proba"): y_true, y_pred = self._get_pred(rows, attr="predict_proba") - if isinstance(y_pred, DataFrame): + if isinstance(y_pred, dataframe_t): # Update every target column with its corresponding threshold for i, value in enumerate(threshold): y_pred.iloc[:, i] = (y_pred.iloc[:, i] > value).astype("int") @@ -974,13 +974,13 @@ def fit_model( # Follow the same stratification strategy as atom cols = self._config.get_stratify_columns(self.og.train, self.og.y_train) - if isinstance(cv := self._ht["cv"], Int): + if isinstance(cv := self._ht["cv"], int_t): if self.task.is_forecast: if cv == 1: splitter = SingleWindowSplitter(range(1, len(self.og.test))) else: splitter = TimeSeriesSplit(n_splits=cv) - elif isinstance(self._ht["cv"], Int): + elif isinstance(self._ht["cv"], int_t): # We use ShuffleSplit instead of K-fold because it # works with n_splits=1 and multioutput stratification if cols is None: @@ -1805,7 +1805,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]: conv = lambda elem: elem.item() if hasattr(elem, "item") else elem y_pred = self.inverse_transform(y=self.predict([X], verbose=0), verbose=0) - if isinstance(y_pred, DataFrame): + if isinstance(y_pred, dataframe_t): return [conv(elem) for elem in y_pred.iloc[0, :]] else: return conv(y_pred[0]) @@ -2028,7 +2028,7 @@ def evaluate( Scores of the model. """ - if isinstance(threshold, Float): + if isinstance(threshold, float_t): threshold_c = [threshold] * self.branch._data.n_cols # Length=n_targets elif len(threshold) != self.branch._data.n_cols: raise ValueError( diff --git a/atom/baserunner.py b/atom/baserunner.py index 0acb64002..7a8bc92b8 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -12,14 +12,15 @@ import random import re from abc import ABCMeta +from collections.abc import Hashable from copy import deepcopy from functools import cached_property from pathlib import Path +from typing import Any import dill as pickle import pandas as pd from beartype import beartype -from beartype.typing import Any, Hashable, Sequence from sklearn.model_selection import train_test_split from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.metaestimators import available_if @@ -34,7 +35,7 @@ from atom.utils.types import ( Bool, DataFrame, FloatZeroToOneExc, Int, MetricConstructor, Model, ModelSelector, ModelsSelector, Pandas, RowSelector, Scalar, Segment, - Series, YSelector, + Sequence, Series, YSelector, dataframe_t, int_t, segment_t, sequence_t, ) from atom.utils.utils import ( ClassMap, DataContainer, Task, bk, check_is_fitted, composed, crash, @@ -108,7 +109,7 @@ def __getitem__(self, item: Int | str | list) -> Any: "This instance has no dataset annexed to it. " "Use the run method before calling __getitem__." 
) - elif isinstance(item, Int): + elif isinstance(item, int_t): return self.dataset[self.columns[item]] elif isinstance(item, str): if item in self._branches: @@ -288,7 +289,7 @@ def _set_index(self, df: DataFrame, y: Pandas | None) -> DataFrame: pass elif self._config.index is False: df = df.reset_index(drop=True) - elif isinstance(self._config.index, Int): + elif isinstance(self._config.index, int_t): if -df.shape[1] <= self._config.index <= df.shape[1]: df = df.set_index(df.columns[int(self._config.index)], drop=True) else: @@ -414,7 +415,7 @@ def _no_data_sets( ) data = _subsample(data) - if isinstance(self._config.index, Sequence): + if isinstance(self._config.index, sequence_t): if len(self._config.index) != len(data): raise IndexError( "Invalid value for the index parameter. Length of " @@ -485,7 +486,7 @@ def _no_data_sets( except ValueError as ex: # Clarify common error with stratification for multioutput tasks - if "least populated class" in str(ex) and isinstance(y, DataFrame): + if "least populated class" in str(ex) and isinstance(y, dataframe_t): raise ValueError( "Stratification for multioutput tasks is applied over all target " "columns, which results in a least populated class that has only " @@ -571,7 +572,7 @@ def _has_data_sets( ) # If the index is a sequence, assign it before shuffling - if isinstance(self._config.index, Sequence): + if isinstance(self._config.index, sequence_t): len_data = len(train) + len(test) if holdout is not None: len_data += len(holdout) @@ -604,7 +605,7 @@ def _has_data_sets( # Process input arrays ===================================== >> if len(arrays) == 0: - if self._goal.name == "forecast" and not isinstance(y, Int | str): + if self._goal.name == "forecast" and not isinstance(y, (*int_t, str)): # arrays=() and y=y for forecasting sets = _no_data_sets(*self._check_input(y=y)) elif not self.branch._container: @@ -625,7 +626,7 @@ def _has_data_sets( X_train, y_train = self._check_input(arrays[0][0], arrays[0][1]) X_test, y_test = self._check_input(arrays[1][0], arrays[1][1]) sets = _has_data_sets(X_train, y_train, X_test, y_test) - elif isinstance(arrays[1], Int | str) or n_cols(arrays[1]) == 1: + elif isinstance(arrays[1], (*int_t, str)) or n_cols(arrays[1]) == 1: if not self._goal.name == "forecast": # arrays=(X, y) sets = _no_data_sets(*self._check_input(arrays[0], arrays[1])) @@ -729,11 +730,11 @@ def _get_models( exc: list[Model] = [] if models is None: inc = self._models.values() - elif isinstance(models, Segment): + elif isinstance(models, segment_t): inc = get_segment(self._models, models) else: for model in lst(models): - if isinstance(model, Int): + if isinstance(model, int_t): try: inc.append(self._models[model]) except KeyError: @@ -788,7 +789,7 @@ def _get_models( return list(dict.fromkeys(inc)) # Avoid duplicates - def _delete_models(self, models: str | Sequence): + def _delete_models(self, models: str | Model | Sequence[str | Model]): """Delete models. Remove models from the instance. All attributes are deleted @@ -797,7 +798,7 @@ def _delete_models(self, models: str | Sequence): Parameters ---------- - models: str or sequence + models: str, Model or sequence Model(s) to delete. """ @@ -1239,7 +1240,7 @@ def stacking( f"{model.fullname} can not perform {self.task} tasks." 
) - kwargs["final_estimator"] = model._get_est() + kwargs["final_estimator"] = model._get_est({}) self._models.append(Stacking(models=models_c, name=name, **kw_model, **kwargs)) diff --git a/atom/basetrainer.py b/atom/basetrainer.py index a574d3259..70cee410e 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -12,12 +12,12 @@ import traceback from abc import ABCMeta from datetime import datetime as dt +from typing import Any import joblib import mlflow import numpy as np import ray -from beartype.typing import Any from joblib import Parallel, delayed from optuna import Study, create_study @@ -26,7 +26,7 @@ from atom.data_cleaning import BaseTransformer from atom.models import MODELS, CustomModel from atom.plots import RunnerPlot -from atom.utils.types import Model, Sequence +from atom.utils.types import Model, sequence_t from atom.utils.utils import ( ClassMap, DataConfig, Goal, Task, check_dependency, get_custom_scorer, lst, sign, time_to_str, @@ -104,7 +104,7 @@ def _check_param(self, param: str, value: Any) -> dict: Parameter with model names as keys. """ - if isinstance(value, Sequence): + if isinstance(value, sequence_t): if len(value) != len(self._models): raise ValueError( f"Invalid value for the {param} parameter. The length " diff --git a/atom/basetransformer.py b/atom/basetransformer.py index c688a2927..778fd74eb 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -13,6 +13,7 @@ import random import tempfile import warnings +from collections.abc import Hashable from copy import deepcopy from datetime import datetime as dt from importlib import import_module @@ -20,7 +21,7 @@ from logging import DEBUG, FileHandler, Formatter, Logger, getLogger from multiprocessing import cpu_count from pathlib import Path -from typing import overload +from typing import Literal, TypeVar, overload import dagshub import mlflow @@ -28,7 +29,6 @@ import ray import requests from beartype import beartype -from beartype.typing import Hashable, Literal, Sequence, TypeVar from dagshub.auth.token_auth import HTTPBearerAuth from joblib.memory import Memory from pandas._typing import Axes @@ -37,7 +37,8 @@ from atom.utils.types import ( Backend, Bool, DataFrame, Engine, Estimator, Int, IntLargerEqualZero, - Pandas, Seq1dim, Severity, Verbose, Warnings, XSelector, YSelector, + Pandas, Sequence, Severity, Verbose, Warnings, XSelector, YSelector, + bool_t, dataframe_t, int_t, sequence_t, ) from atom.utils.utils import crash, flt, n_cols, sign, to_df, to_pandas @@ -208,7 +209,7 @@ def warnings(self) -> Warnings: @warnings.setter @beartype def warnings(self, value: Bool | Warnings): - if isinstance(value, Bool): + if isinstance(value, bool_t): self._warnings: Warnings = "once" if value else "ignore" else: self._warnings = value @@ -501,7 +502,7 @@ def _check_input( ) # Prepare target column - if isinstance(y, dict | Seq1dim | DataFrame): + if isinstance(y, (dict, *sequence_t, *dataframe_t)): if isinstance(y, dict): yt = to_df(deepcopy(y), index=getattr(Xt, "index", None)) if n_cols(yt) == 1: @@ -515,7 +516,7 @@ def _check_input( for col in y: if col in Xt.columns: targets.append(col) - elif isinstance(col, Int): + elif isinstance(col, int_t): if -Xt.shape[1] <= col < Xt.shape[1]: targets.append(Xt.columns[int(col)]) else: @@ -540,7 +541,7 @@ def _check_input( data=deepcopy(yt), index=getattr(Xt, "index", None), name=flt(name) if name is not None else "target", - columns=name if isinstance(name, Sequence) else default_cols, + columns=name if isinstance(name, sequence_t) else 
default_cols, ) # Check X and y have the same indices @@ -557,7 +558,7 @@ def _check_input( else: raise ValueError("X can't be None when y is a string.") - elif isinstance(y, Int): + elif isinstance(y, int_t): if Xt is None: raise ValueError("X can't be None when y is an int.") diff --git a/atom/branch/branch.py b/atom/branch/branch.py index 2d9e7af0f..d411bc5c0 100644 --- a/atom/branch/branch.py +++ b/atom/branch/branch.py @@ -10,23 +10,23 @@ from __future__ import annotations import re +from collections.abc import Hashable from functools import cached_property from pathlib import Path -from typing import overload +from typing import Literal, overload from warnings import filterwarnings import dill as pickle from beartype import beartype from beartype.roar import BeartypeDecorHintPep585DeprecationWarning -from beartype.typing import Hashable, Literal, Sequence from joblib.memory import Memory from sklearn.utils.validation import check_memory from atom.pipeline import Pipeline from atom.utils.types import ( Bool, ColumnSelector, DataFrame, Index, Int, IntLargerEqualZero, Pandas, - RowSelector, Scalar, Segment, Series, TargetSelector, TargetsSelector, - XSelector, YSelector, + RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, XSelector, + YSelector, dataframe_t, int_t, segment_t, series_t, ) from atom.utils.utils import ( DataContainer, bk, flt, get_cols, lst, merge, to_pandas, @@ -247,7 +247,7 @@ def counter(name: str, dim: str) -> str | None: ) if under_name: # Check for equal columns - if isinstance(obj, Series): + if isinstance(obj, series_t): if obj.name != under.name: raise ValueError( f"{name} and {under_name} must have the " @@ -508,15 +508,15 @@ def _get_rows( inc: list[Hashable] = [] exc: list[Hashable] = [] - if isinstance(rows, DataFrame): + if isinstance(rows, dataframe_t): inc.extend(rows.index) - elif isinstance(rows, Segment): + elif isinstance(rows, segment_t): inc.extend(indices[rows]) else: for row in lst(rows): if row in indices: inc.append(row) - elif isinstance(row, Int): + elif isinstance(row, int_t): if -len(indices) <= row < len(indices): inc.append(indices[int(row)]) else: @@ -604,13 +604,13 @@ def _get_columns( return list(df.select_dtypes(include=["number"]).columns) else: return list(df.columns) - elif isinstance(columns, DataFrame): + elif isinstance(columns, dataframe_t): inc.extend(list(columns.columns)) - elif isinstance(columns, Segment): + elif isinstance(columns, segment_t): inc.extend(list(df.columns[columns])) else: for col in lst(columns): - if isinstance(col, Int): + if isinstance(col, int_t): if -df.shape[1] <= col < df.shape[1]: inc.append(df.columns[int(col)]) else: @@ -766,7 +766,7 @@ def get_class( if only_columns and not isinstance(target, tuple): return get_column(target) elif isinstance(target, tuple): - if not isinstance(self.y, DataFrame): + if not isinstance(self.y, dataframe_t): raise ValueError( f"Invalid value for the target parameter, got {target}. " "A tuple is only accepted for multioutput tasks." 
diff --git a/atom/branch/branchmanager.py b/atom/branch/branchmanager.py index 36972bd62..89de18c61 100644 --- a/atom/branch/branchmanager.py +++ b/atom/branch/branchmanager.py @@ -10,10 +10,10 @@ from __future__ import annotations import shutil +from collections.abc import Iterator from copy import copy, deepcopy from beartype import beartype -from beartype.typing import Iterator from joblib.memory import Memory from sklearn.utils.validation import check_memory diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index bbf520f31..94c32193d 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -11,13 +11,14 @@ import re from collections import defaultdict +from collections.abc import Hashable from logging import Logger from pathlib import Path +from typing import Any, Literal import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Any, Hashable, Literal, Sequence from category_encoders import ( BackwardDifferenceEncoder, BaseNEncoder, BinaryEncoder, CatBoostEncoder, HelmertEncoder, JamesSteinEncoder, MEstimateEncoder, OneHotEncoder, @@ -35,8 +36,7 @@ TomekLinks, ) from scipy.stats import zscore -from sklearn.base import BaseEstimator -from sklearn.compose import ColumnTransformer +from sklearn.base import BaseEstimator, _clone_parametrized from sklearn.experimental import enable_iterative_imputer from sklearn.impute import IterativeImputer, KNNImputer from typing_extensions import Self @@ -48,12 +48,13 @@ Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine, Estimator, FloatLargerZero, IntLargerEqualZero, IntLargerTwo, NJobs, NormalizerStrats, NumericalStrats, Pandas, PrunerStrats, Scalar, - ScalerStrats, Seq1dim, Series, Transformer, Verbose, XSelector, YSelector, + ScalerStrats, Sequence, Series, Transformer, Verbose, XSelector, YSelector, + dataframe_t, sequence_t, series_t, ) from atom.utils.utils import ( bk, check_is_fitted, composed, crash, get_cols, it, lst, merge, method_to_log, n_cols, replace_missing, sign, to_df, to_series, - variable_return, + variable_return, wrap_methods, ) @@ -61,15 +62,36 @@ class TransformerMixin(BaseEstimator, BaseTransformer): """Mixin class for all transformers in ATOM. - Different from sklearn, since it accounts for the transformation - of y and a possible absence of the fit method. + Different from sklearn in the following ways: + + - Accounts for the transformation of y. + - Always add a fit method. + - Wraps the fit method with a data check. + - Wraps transforming methods with fit and data check. + - Maintains internal attributes when cloned. """ + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + + for k in ("fit", "transform", "inverse_transform"): + setattr(cls, k, wrap_methods(getattr(cls, k))) + + def __sklearn_clone__(self): + """Custom implementation to attach internal attributes.""" + cloned = _clone_parametrized(self) + + for attr in ("_cols", "_train_only"): + if hasattr(self, attr): + setattr(cloned, attr, getattr(self, attr)) + + return cloned + def fit( self, - X: XSelector | None = None, - y: YSelector | None = None, + X: DataFrame | None = None, + y: Pandas | None = None, **fit_params, ) -> Self: """Does nothing. @@ -82,7 +104,7 @@ def fit( Feature set with shape=(n_samples, n_features). If None, X is ignored. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X. - If None: y is ignored. 
@@ -104,10 +126,6 @@ def fit( Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - self._log(f"Fitting {self.__class__.__name__}...", 1) return self @@ -127,7 +145,7 @@ def fit_transform( Feature set with shape=(n_samples, n_features). If None, X is ignored. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X. - If None: y is ignored. @@ -157,8 +175,8 @@ def fit_transform( @composed(crash, method_to_log) def inverse_transform( self, - X: XSelector | None = None, - y: YSelector | None = None, + X: DataFrame | None = None, + y: Pandas | None = None, ) -> Pandas | tuple[DataFrame, Pandas]: """Does nothing. @@ -171,7 +189,7 @@ def inverse_transform( Feature set with shape=(n_samples, n_features). If None, X is ignored. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X. - If None: y is ignored. @@ -187,10 +205,10 @@ def inverse_transform( Returns ------- dataframe - Transformed feature set. Only returned if provided. + Feature set. Only returned if provided. series or dataframe - Transformed target column. Only returned if provided. + Target column. Only returned if provided. """ return variable_return(X, y) @@ -329,7 +347,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector = -1) -> Self: + def fit(self, X: DataFrame, y: Pandas = -1) -> Self: """Fit to data. Parameters @@ -355,14 +373,10 @@ def fit(self, X: XSelector, y: YSelector = -1) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - - if isinstance(yt, DataFrame): + if isinstance(y, dataframe_t): raise ValueError("The Balancer class does not support multioutput tasks.") else: - self.target_names_in_ = np.array([yt.name]) + self.target_names_in_ = np.array([y.name]) strategies = dict( # clustercentroids=ClusterCentroids, # Has no sample_indices_ @@ -407,14 +421,14 @@ def fit(self, X: XSelector, y: YSelector = -1) -> Self: # Create dict of class counts in y if not hasattr(self, "mapping_"): - self.mapping_ = {str(v): v for v in yt.sort_values().unique()} + self.mapping_ = {str(v): v for v in y.sort_values().unique()} self._counts = {} for key, value in self.mapping_.items(): - self._counts[key] = np.sum(yt == value) + self._counts[key] = np.sum(y == value) # Add n_jobs or random_state if its one of the estimator's parameters - self._estimator = self._inherit(estimator).fit(Xt, yt) + self._estimator = self._inherit(estimator).fit(X, y) # Add the estimator as attribute to the instance setattr(self, f"{estimator.__class__.__name__.lower()}_", self._estimator) @@ -422,7 +436,7 @@ def fit(self, X: XSelector, y: YSelector = -1) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector = -1) -> tuple[DataFrame, Series]: + def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]: """Balance the data. 
Parameters @@ -447,61 +461,53 @@ def transform(self, X: XSelector, y: YSelector = -1) -> tuple[DataFrame, Series] """ - def log_changes(yt): + def log_changes(y): """Print the changes per target class.""" for key, value in self.mapping_.items(): - diff = self._counts[key] - np.sum(yt == value) + diff = self._counts[key] - np.sum(y == value) if diff > 0: self._log(f" --> Removing {diff} samples from class {key}.", 2) elif diff < 0: self._log(f" --> Adding {-diff} samples to class {key}.", 2) - check_is_fitted(self) - Xt, yt = self._check_input( - X=X, - y=y, - columns=getattr(self, "feature_names_in_", None), - name=getattr(self, "target_names_in_", None), - ) - if "over_sampling" in self._estimator.__module__: self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1) - index = Xt.index # Save indices for later reassignment - Xt, yt = self._estimator.fit_resample(Xt, yt) + index = X.index # Save indices for later reassignment + X, y = self._estimator.fit_resample(X, y) # Create indices for the new samples n_idx: list[int | str] if index.dtype.kind in "ifu": - n_idx = list(range(max(index) + 1, max(index) + len(Xt) - len(index) + 1)) + n_idx = list(range(max(index) + 1, max(index) + len(X) - len(index) + 1)) else: n_idx = [ f"{self._estimator.__class__.__name__.lower()}_{i}" - for i in range(1, len(Xt) - len(index) + 1) + for i in range(1, len(X) - len(index) + 1) ] # Assign the old + new indices - Xt.index = list(index) + list(n_idx) - yt.index = list(index) + list(n_idx) + X.index = list(index) + list(n_idx) + y.index = list(index) + list(n_idx) - log_changes(yt) + log_changes(y) elif "under_sampling" in self._estimator.__module__: self._log(f"Undersampling with {self._estimator.__class__.__name__}...", 1) - self._estimator.fit_resample(Xt, yt) + self._estimator.fit_resample(X, y) # Select chosen rows (imblearn doesn't return them in order) samples = sorted(self._estimator.sample_indices_) - Xt, yt = Xt.iloc[samples], yt.iloc[samples] # type: ignore[call-overload] + X, y = X.iloc[samples], y.iloc[samples] # type: ignore[call-overload] - log_changes(yt) + log_changes(y) elif "combine" in self._estimator.__module__: self._log(f"Balancing with {self._estimator.__class__.__name__}...", 1) - index = Xt.index - X_new, y_new = self._estimator.fit_resample(Xt, yt) + index = X.index + X_new, y_new = self._estimator.fit_resample(X, y) # Select rows kept by the undersampler if self._estimator.__class__.__name__ == "SMOTEENN": @@ -510,16 +516,16 @@ def log_changes(yt): samples = sorted(self._estimator.tomek_.sample_indices_) # Select the remaining samples from the old dataframe - o_samples = [s for s in samples if s < len(Xt)] - Xt, yt = Xt.iloc[o_samples], yt.iloc[o_samples] # type: ignore[call-overload] + o_samples = [s for s in samples if s < len(X)] + X, y = X.iloc[o_samples], y.iloc[o_samples] # type: ignore[call-overload] # Create indices for the new samples if index.dtype.kind in "ifu": - n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(Xt) + 1)) + n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(X) + 1)) else: n_idx = [ f"{self._estimator.__class__.__name__.lower()}_{i}" - for i in range(1, len(X_new) - len(Xt) + 1) + for i in range(1, len(X_new) - len(X) + 1) ] # Select the new samples and assign the new indices @@ -539,9 +545,9 @@ def log_changes(yt): self._log(f" --> Removing {diff} samples from class: {key}.", 2) # Add the new samples to the old dataframe - Xt, yt = bk.concat([Xt, X_new]), bk.concat([yt, y_new]) + X, y = bk.concat([X, 
X_new]), bk.concat([y, y_new]) - return Xt, yt + return X, y @beartype @@ -715,7 +721,7 @@ def __init__( self.encode_target = encode_target @composed(crash, method_to_log) - def fit(self, X: XSelector | None = None, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -742,10 +748,6 @@ def fit(self, X: XSelector | None = None, y: YSelector | None = None) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - self.mapping_: dict[str, Any] = {} self._estimators = {} @@ -754,24 +756,24 @@ def fit(self, X: XSelector | None = None, y: YSelector | None = None) -> Self: self._log("Fitting Cleaner...", 1) - if yt is not None: - if isinstance(yt, Series): - self.target_names_in_ = np.array([yt.name]) + if y is not None: + if isinstance(y, series_t): + self.target_names_in_ = np.array([y.name]) else: - self.target_names_in_ = yt.columns.values + self.target_names_in_ = y.columns.values if self.drop_chars: - if isinstance(y, Series): - yt.name = re.sub(self.drop_chars, "", str(yt.name)) + if isinstance(y, series_t): + y.name = re.sub(self.drop_chars, "", str(y.name)) else: - yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) if self.drop_missing_target: - yt = replace_missing(yt, self.missing_).dropna(axis=0) + y = replace_missing(y, self.missing_).dropna(axis=0) if self.encode_target: - for col in get_cols(yt): - if isinstance(col.iloc[0], Seq1dim): # Multilabel + for col in get_cols(y): + if isinstance(col.iloc[0], sequence_t): # Multilabel MultiLabelBinarizer = self._get_est_class( name="MultiLabelBinarizer", module="preprocessing", @@ -789,8 +791,8 @@ def fit(self, X: XSelector | None = None, y: YSelector | None = None) -> Self: @composed(crash, method_to_log) def transform( self, - X: XSelector | None = None, - y: YSelector | None = None, + X: DataFrame | None = None, + y: Pandas | None = None, ) -> Pandas | tuple[DataFrame, Pandas]: """Apply the data cleaning steps to the data. @@ -821,21 +823,13 @@ def transform( Transformed target column. Only returned if provided. 
""" - check_is_fitted(self) - Xt, yt = self._check_input( - X=X, - y=y, - columns=getattr(self, "feature_names_in_", None), - name=getattr(self, "target_names_in_", None), - ) - self._log("Cleaning the data...", 1) - if Xt is not None: + if X is not None: # Unify all missing values - Xt = replace_missing(Xt, self.missing_) + X = replace_missing(X, self.missing_) - for name, column in Xt.items(): + for name, column in X.items(): dtype = column.dtype.name # Drop features with an invalid data type @@ -844,82 +838,82 @@ def transform( f" --> Dropping feature {name} for " f"having a prohibited type: {dtype}.", 2 ) - Xt = Xt.drop(columns=name) + X = X.drop(columns=name) continue elif dtype in CAT_TYPES: if self.strip_categorical: # Strip strings from blank spaces - Xt[name] = column.apply( + X[name] = column.apply( lambda val: val.strip() if isinstance(val, str) else val ) # Drop prohibited chars from column names if self.drop_chars: - Xt = Xt.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x))) + X = X.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x))) # Drop duplicate samples if self.drop_duplicates: - Xt = Xt.drop_duplicates(ignore_index=True) + X = X.drop_duplicates(ignore_index=True) if self.convert_dtypes: - Xt = Xt.convert_dtypes() + X = X.convert_dtypes() - if yt is not None: + if y is not None: if self.drop_chars: - if isinstance(yt, Series): - yt.name = re.sub(self.drop_chars, "", str(yt.name)) + if isinstance(y, series_t): + y.name = re.sub(self.drop_chars, "", str(y.name)) else: - yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) # Delete samples with missing values in target if self.drop_missing_target: - length = len(yt) # Save original length to count deleted rows later - yt = replace_missing(yt, self.missing_).dropna() + length = len(y) # Save original length to count deleted rows later + y = replace_missing(y, self.missing_).dropna() - if Xt is not None: - Xt = Xt[Xt.index.isin(yt.index)] # Select only indices that remain + if X is not None: + X = X[X.index.isin(y.index)] # Select only indices that remain - if (d := length - len(yt)) > 0: + if (d := length - len(y)) > 0: self._log(f" --> Dropping {d} rows with missing values in target.", 2) if self.encode_target and self._estimators: - y_trans = yt.__class__(dtype="object") - for col in get_cols(yt): + yt = y.__class__(dtype="object") + for col in get_cols(y): if est := self._estimators.get(col.name): if n_cols(out := est.transform(col)) == 1: self._log(f" --> Label-encoding column {col.name}.", 2) - out = to_series(out, yt.index, col.name) + out = to_series(out, y.index, col.name) else: self._log(f" --> Label-binarizing column {col.name}.", 2) out = to_df( data=out, - index=yt.index, + index=y.index, columns=[f"{col.name}_{c}" for c in est.classes_], ) # Replace target with encoded column(s) - if isinstance(yt, Series): - y_trans = out + if isinstance(y, series_t): + yt = out else: - y_trans = merge(y_trans, out) + yt = merge(yt, out) else: # Add unchanged column - y_trans = merge(y_trans, col) + yt = merge(yt, col) - yt = y_trans + y = yt if self.convert_dtypes: - yt = yt.convert_dtypes() + y = y.convert_dtypes() - return variable_return(Xt, yt) + return variable_return(X, y) @composed(crash, method_to_log) def inverse_transform( self, - X: XSelector | None = None, - y: YSelector | None = None, + X: DataFrame | None = None, + y: Pandas | None = None, ) -> Pandas | tuple[DataFrame, Pandas]: """Inversely transform the 
label encoding. @@ -953,36 +947,34 @@ def inverse_transform( Original target column. Only returned if provided. """ - Xt, yt = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - self._log("Inversely cleaning the data...", 1) - if yt is not None and self._estimators: - y_trans = yt.__class__(dtype="object") + if y is not None and self._estimators: + yt = y.__class__(dtype="object") for col in self.target_names_in_: if est := self._estimators.get(col): if est.__class__.__name__ == "LabelEncoder": self._log(f" --> Inversely label-encoding column {col}.", 2) - out = est.inverse_transform(bk.DataFrame(yt)[col]) + out = est.inverse_transform(bk.DataFrame(y)[col]) - elif isinstance(yt, DataFrame): + elif isinstance(y, dataframe_t): self._log(f" --> Inversely label-binarizing column {col}.", 2) out = est.inverse_transform( - yt.loc[:, yt.columns.str.startswith(f"{col}_")].to_numpy() + y.loc[:, y.columns.str.startswith(f"{col}_")].to_numpy() ) # Replace encoded columns with target column - if isinstance(yt, Series): - y_trans = to_series(out, yt.index, col) + if isinstance(y, series_t): + yt = to_series(out, y.index, col) else: - y_trans = merge(y_trans, to_series(out, yt.index, col)) + yt = merge(yt, to_series(out, y.index, col)) else: # Add unchanged column - y_trans = merge(y_trans, bk.DataFrame(yt)[col]) + yt = merge(yt, bk.DataFrame(y)[col]) - yt = y_trans + y = yt - return variable_return(Xt, yt) + return variable_return(X, y) @beartype @@ -1019,16 +1011,16 @@ class Discretizer(TransformerMixin): strategy!="custom". - If sequence: - - For strategy!="custom": Number of bins per column, - allowing for non-uniform width. The n-th value corresponds - to the n-th column that is transformed. Note that - categorical columns are automatically ignored. + - For strategy!="custom": Number of bins per column. The + n-th value corresponds to the n-th column that is + transformed. Categorical columns are ignored. - For strategy="custom": Bin edges with length=n_bins - 1. The outermost edges are always `-inf` and `+inf`, e.g., bins `[1, 2]` indicate `(-inf, 1], (1, 2], (2, inf]`. - If dict: One of the aforementioned options per column, where - the key is the column's name. + the key is the column's name. Columns that are not in the + dictionary are not transformed. labels: sequence, dict or None, default=None Label names with which to replace the binned intervals. @@ -1036,6 +1028,7 @@ class Discretizer(TransformerMixin): - If None: Use default labels of the form `(min_edge, max_edge]`. - If sequence: Labels to use for all columns. - If dict: Labels per column, where the key is the column's name. + Columns that are not in the dictionary use the default labels. device: str, default="cpu" Device on which to run the estimators. Use any string that @@ -1123,13 +1116,13 @@ class Discretizer(TransformerMixin): X, y = load_breast_cancer(return_X_y=True, as_frame=True) print(X["mean radius"]) - disc = Discretizer( + discretizer = Discretizer( strategy="custom", - bins=[13, 18], + bins={"mean radius": [13, 18]}, labels=["small", "medium", "large"], verbose=2, ) - X["mean radius"] = disc.fit_transform(X[["mean radius"]])["mean radius"] + X = discretizer.fit_transform(X) print(X["mean radius"]) ``` @@ -1160,7 +1153,7 @@ def __init__( self.labels = labels @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. 
Parameters @@ -1168,7 +1161,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -1178,11 +1171,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: """ - def get_labels( - col: str, - labels: Sequence[str] | dict[str, Sequence[str]], - bins: Sequence[Scalar], - ) -> tuple[str, ...]: + def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: """Get labels for the specified bins. Parameters @@ -1190,9 +1179,6 @@ def get_labels( col: str Name of the column. - labels: sequence or dict - Label names for the binned intervals. - bins: sequence Bin edges. @@ -1202,12 +1188,17 @@ def get_labels( Labels for the column. """ - if isinstance(labels, dict): - default = [ - f"({np.round(bins[i], 2)}, {np.round(bins[i+1], 1)}]" - for i in range(len(bins[:-1])) - ] - labels = labels.get(col, default) + default = [ + f"({np.round(bins[i], 2)}, {np.round(bins[i + 1], 1)}]" + for i in range(len(bins[:-1])) + ] + + if self.labels is None: + labels = tuple(default) + elif isinstance(self.labels, dict): + labels = tuple(self.labels.get(col, default)) + else: + labels = tuple(self.labels) if len(bins) - 1 != len(labels): raise ValueError( @@ -1216,39 +1207,29 @@ def get_labels( f"len(bins)={len(bins) - 1} and len(labels)={len(labels)}." ) - return tuple(labels) + return labels Xt, yt = self._check_input(X, y) self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) - self._discretizers: dict[str, Estimator] = {} + self._estimators: dict[str, Estimator] = {} self._labels: dict[str, Sequence[str]] = {} - self._num_cols = list(Xt.select_dtypes(include="number")) self._log("Fitting Discretizer...", 1) - labels: Sequence[str] | dict[str, Sequence[str]] - if self.labels is None: - labels = {} - else: - labels = self.labels - - for i, col in enumerate(self._num_cols): - # Assign the proper bins for this column + for i, col in enumerate(X.select_dtypes(include="number")): + # Assign bins per column if isinstance(self.bins, dict): if col in self.bins: bins_c = self.bins[col] else: - raise ValueError( - "Invalid value for the bins parameter. Column " - f"{col} not found in the dictionary." - ) + continue # Ignore existing column not specified in dict else: bins_c = self.bins if self.strategy != "custom": - if isinstance(bins_c, Sequence): + if isinstance(bins_c, sequence_t): try: bins_x = bins_c[i] # Fetch the i-th bin for the i-th column except IndexError: @@ -1268,7 +1249,7 @@ def get_labels( kwargs["subsample"] = 200000 kwargs["random_state"] = self.random_state - self._discretizers[col] = KBinsDiscretizer( + self._estimators[col] = KBinsDiscretizer( n_bins=bins_x, encode="ordinal", strategy=self.strategy, @@ -1278,12 +1259,11 @@ def get_labels( # Save labels for transform method self._labels[col] = get_labels( col=col, - labels=labels, - bins=self._discretizers[col].bin_edges_[0], + bins=self._estimators[col].bin_edges_[0], ) else: - if not isinstance(bins_c, Sequence): + if not isinstance(bins_c, sequence_t): raise TypeError( f"Invalid type for the bins parameter, got {bins_c}. Only " "a sequence of bin edges is accepted when strategy='custom'." 
@@ -1297,15 +1277,15 @@ def get_labels( ) # Make of cut a transformer - self._discretizers[col] = FunctionTransformer( + self._estimators[col] = FunctionTransformer( func=bk.cut, - kw_args={"bins": bins_c, "labels": get_labels(col, labels, bins_c)}, + kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)}, ).fit(Xt[[col]]) return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Bin the data into intervals. Parameters @@ -1313,7 +1293,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -1322,23 +1302,21 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Transformed feature set. """ - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - self._log("Binning the features...", 1) - for col in self._num_cols: + for col in self._estimators: if self.strategy == "custom": - Xt[col] = self._discretizers[col].transform(Xt[col]) + X[col] = self._estimators[col].transform(X[col]) else: - Xt[col] = self._discretizers[col].transform(Xt[[col]])[:, 0] + X[col] = self._estimators[col].transform(X[[col]]).iloc[:, 0] # Replace cluster values with labels for i, label in enumerate(self._labels[col]): - Xt[col] = Xt[col].replace(i, label) + X[col] = X[col].replace(i, label) - self._log(f" --> Discretizing feature {col} in {Xt[col].nunique()} bins.", 2) + self._log(f" --> Discretizing feature {col} in {X[col].nunique()} bins.", 2) - return Xt + return X @beartype @@ -1496,7 +1474,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Note that leaving y=None can lead to errors if the `strategy` @@ -1526,15 +1504,11 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - self.mapping_: dict[str, dict[Hashable, Scalar]] = defaultdict(dict) self._to_value = defaultdict(list) self._categories = {} self._encoders = {} - self._cat_cols = list(Xt.select_dtypes(exclude="number").columns) + self._cat_cols = list(X.select_dtypes(exclude="number").columns) strategies = dict( backwarddifference=BackwardDifferenceEncoder, @@ -1553,7 +1527,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: if isinstance(self.strategy, str): if self.strategy.lower().endswith("encoder"): - self.strategy = self.strategy[:-7] # Remove the Encoder at the end + self.strategy = self.strategy[:-7] # Remove 'Encoder' at the end if self.strategy.lower() not in strategies: raise ValueError( f"Invalid value for the strategy parameter, got {self.strategy}. 
" @@ -1575,19 +1549,19 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: if self.infrequent_to_value: if self.infrequent_to_value < 1: - infrequent_to_value = int(self.infrequent_to_value * len(Xt)) + infrequent_to_value = int(self.infrequent_to_value * len(X)) else: infrequent_to_value = int(self.infrequent_to_value) self._log("Fitting Encoder...", 1) - for name, column in Xt[self._cat_cols].items(): + for name, column in X[self._cat_cols].items(): # Replace infrequent classes with the string in `value` if self.infrequent_to_value: for category, count in column.value_counts().items(): if count <= infrequent_to_value: self._to_value[name].append(category) - Xt[name] = column.replace(category, self.value) # type: ignore + X[name] = column.replace(category, self.value) # type: ignore # Get the unique categories before fitting self._categories[name] = column.dropna().sort_values().unique().tolist() @@ -1615,7 +1589,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: cols=[name], # Specify to not skip bool columns handle_missing="return_nan", handle_unknown="value", - ).fit(Xt[[name]]) + ).fit(X[[name]]) elif 2 < len(self._categories[name]) <= max_onehot: self._encoders[name] = OneHotEncoder( @@ -1623,12 +1597,12 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: use_cat_names=True, handle_missing="return_nan", handle_unknown="value", - ).fit(Xt[[name]]) + ).fit(X[[name]]) else: - args = [Xt[[name]]] + args = [X[[name]]] if "y" in sign(estimator.fit): - args.append(bk.DataFrame(yt).iloc[:, 0]) + args.append(bk.DataFrame(y).iloc[:, 0]) self._encoders[name] = estimator( cols=[name], @@ -1655,7 +1629,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Encode the data. Parameters @@ -1663,7 +1637,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -1672,27 +1646,24 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Encoded dataframe. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - self._log("Encoding categorical columns...", 1) - for name, column in Xt[self._cat_cols].items(): + for name, column in X[self._cat_cols].items(): # Convert infrequent classes to value if self._to_value[name]: - Xt[name] = column.replace(self._to_value[name], self.value) + X[name] = column.replace(self._to_value[name], self.value) self._log( f" --> {self._encoders[name].__class__.__name__[:-7]}-encoding " f"feature {name}. 
Contains {column.nunique()} classes.", 2 ) - # Count the propagated missingXt[[name]] values + # Count the propagated missingX[[name]] values if n_nans := column.isna().sum(): self._log(f" --> Propagating {n_nans} missing values.", 2) # Get the new encoded columns - new_cols = self._encoders[name].transform(Xt[[name]]) + new_cols = self._encoders[name].transform(X[[name]]) # Drop _nan columns (since missing values are propagated) new_cols = new_cols.loc[:, ~new_cols.columns.str.endswith("_nan")] @@ -1703,17 +1674,17 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: # Insert the new columns at old location for i, new_col in enumerate(sorted(new_cols)): - if new_col in Xt: - Xt[new_col] = new_cols[new_col].values # Replace existing column + if new_col in X: + X[new_col] = new_cols[new_col].values # Replace existing column else: # Drop the original column - if name in Xt: - idx = Xt.columns.get_loc(name) - Xt = Xt.drop(columns=name) + if name in X: + idx = X.columns.get_loc(name) + X = X.drop(columns=name) - Xt.insert(idx + i, new_col, new_cols[new_col]) + X.insert(idx + i, new_col, new_cols[new_col]) - return Xt + return X @beartype @@ -1898,7 +1869,7 @@ def __init__( self.max_nan_cols = max_nan_cols @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -1906,7 +1877,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -1915,26 +1886,22 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - if not hasattr(self, "missing_"): self.missing_ = DEFAULT_MISSING self._log("Fitting Imputer...", 1) # Unify all values to impute - Xt = replace_missing(Xt, self.missing_) + X = replace_missing(X, self.missing_) if self.max_nan_rows is not None: if self.max_nan_rows <= 1: - self._max_nan_rows = int(Xt.shape[1] * self.max_nan_rows) + self._max_nan_rows = int(X.shape[1] * self.max_nan_rows) else: self._max_nan_rows = int(self.max_nan_rows) - Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows) - if Xt.empty: + X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows) + if X.empty: raise ValueError( "Invalid value for the max_nan_rows parameter, got " f"{self.max_nan_rows}. 
All rows contain more than " @@ -1944,11 +1911,11 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: if self.max_nan_cols is not None: if self.max_nan_cols <= 1: - max_nan_cols = int(Xt.shape[0] * self.max_nan_cols) + max_nan_cols = int(X.shape[0] * self.max_nan_cols) else: max_nan_cols = int(self.max_nan_cols) - Xt = Xt.drop(columns=Xt.columns[Xt.isna().sum() > max_nan_cols]) + X = X.drop(columns=X.columns[X.isna().sum() > max_nan_cols]) # Load the imputer class from sklearn or cuml (note the different modules) SimpleImputer = self._get_est_class( @@ -1986,23 +1953,25 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: fill_value=self.strat_cat, ) + ColumnTransformer = self._get_est_class("ColumnTransformer", "compose") + self._estimator = ColumnTransformer( transformers=[ - ("num_imputer", num_imputer, list(Xt.select_dtypes(include="number"))), - ("cat_imputer", cat_imputer, list(Xt.select_dtypes(include=CAT_TYPES))), + ("num_imputer", num_imputer, list(X.select_dtypes(include="number"))), + ("cat_imputer", cat_imputer, list(X.select_dtypes(include=CAT_TYPES))), ], remainder="passthrough", n_jobs=self.n_jobs, verbose_feature_names_out=False, - ).set_output(transform="pandas").fit(Xt) + ).fit(X) return self @composed(crash, method_to_log) def transform( self, - X: XSelector, - y: YSelector | None = None, + X: DataFrame, + y: Pandas | None = None, ) -> Pandas | tuple[DataFrame, Pandas]: """Impute the missing values. @@ -2036,9 +2005,6 @@ def transform( Transformed target column. Only returned if provided. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - num_imputer = self._estimator.named_transformers_["num_imputer"] cat_imputer = self._estimator.named_transformers_["cat_imputer"] @@ -2047,46 +2013,46 @@ def transform( self._log("Imputing missing values...", 1) # Unify all values to impute - Xt = replace_missing(Xt, self.missing_) + X = replace_missing(X, self.missing_) # Drop rows with too many missing values if self.max_nan_rows is not None: - length = len(Xt) - Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows) - if diff := length - len(Xt): + length = len(X) + X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows) + if diff := length - len(X): self._log( f" --> Dropping {diff} samples for containing more " f"than {self._max_nan_rows} missing values.", 2 ) if self.strat_num == "drop": - length = len(Xt) - Xt = Xt.dropna(subset=self._estimator.transformers_[0][2], axis=0) - if diff := length - len(Xt): + length = len(X) + X = X.dropna(subset=self._estimator.transformers_[0][2]) + if diff := length - len(X): self._log( f" --> Dropping {diff} samples for containing " f"missing values in numerical columns.", 2 ) if self.strat_cat == "drop": - length = len(Xt) - Xt = Xt.dropna(subset=self._estimator.transformers_[1][2], axis=0) - if diff := length - len(Xt): + length = len(X) + X = X.dropna(subset=self._estimator.transformers_[1][2]) + if diff := length - len(X): self._log( f" --> Dropping {diff} samples for containing " f"missing values in categorical columns.", 2 ) # Print imputation information per feature - for name, column in Xt.items(): + for name, column in X.items(): if nans := column.isna().sum(): # Drop columns with too many missing values if name not in self._estimator.feature_names_in_: self._log( f" --> Dropping feature {name}. 
Contains {nans} " - f"({nans * 100 // len(Xt)}%) missing values.", 2 + f"({nans * 100 // len(X)}%) missing values.", 2 ) - Xt = Xt.drop(columns=name) + X = X.drop(columns=name) continue if self.strat_num != "drop" and name in num_imputer.feature_names_in_: @@ -2118,16 +2084,16 @@ def transform( f"'{self.strat_cat}' in feature {name}.", 2 ) - Xt = self._estimator.transform(Xt) + X = self._estimator.transform(X) # Make y consistent with X - if yt is not None: - yt = yt[yt.index.isin(Xt.index)] + if y is not None: + y = y[y.index.isin(X.index)] # Reorder columns to original order - Xt = Xt[[fx for fx in self.feature_names_in_ if fx in Xt.columns]] + X = X[[col for col in self.feature_names_in_ if col in X.columns]] - return variable_return(Xt, yt) + return variable_return(X, y) @beartype @@ -2280,7 +2246,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -2288,7 +2254,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -2297,12 +2263,6 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - - self._num_cols = list(Xt.select_dtypes(include="number")) - strategies = dict( yeojohnson="PowerTransformer", boxcox="PowerTransformer", @@ -2330,7 +2290,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: ) self._log("Fitting Normalizer...", 1) - self._estimator.fit(Xt[self._num_cols]) + self._estimator.fit(X.select_dtypes(include="number")) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) @@ -2338,7 +2298,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the transformations to the data. Parameters @@ -2346,7 +2306,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -2355,24 +2315,15 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Normalized dataframe. 
""" - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - self._log("Normalizing features...", 1) - X_transformed = self._estimator.transform(Xt[self._num_cols]) + Xt = self._estimator.transform(X[self._estimator.feature_names_in_]) - # If all columns were transformed, just swap sets - if len(self._num_cols) != Xt.shape[1]: - # Replace the numerical columns with the transformed values - for i, col in enumerate(self._num_cols): - Xt[col] = X_transformed[:, i] - else: - Xt = to_df(X_transformed, Xt.index, Xt.columns) + X.update(Xt) # Reorder columns to original order - return Xt + return X[self.feature_names_in_] @composed(crash, method_to_log) - def inverse_transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the inverse transformation to the data. Parameters @@ -2380,7 +2331,7 @@ def inverse_transform(self, X: XSelector, y: YSelector | None = None) -> DataFra X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -2389,21 +2340,13 @@ def inverse_transform(self, X: XSelector, y: YSelector | None = None) -> DataFra Original dataframe. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y) - self._log("Inversely normalizing features...", 1) - X_transformed = self._estimator.inverse_transform(Xt[self._num_cols]) + Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_]) + Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_) - # If all columns were transformed, just swap sets - if len(self._num_cols) != Xt.shape[1]: - # Replace the numerical columns with the transformed values - for i, col in enumerate(self._num_cols): - Xt[col] = X_transformed[:, i] - else: - Xt = to_df(X_transformed, Xt.index, Xt.columns) + X.update(Xt) - return Xt + return X @beartype @@ -2577,8 +2520,8 @@ def __init__( @composed(crash, method_to_log) def transform( self, - X: XSelector, - y: YSelector | None = None, + X: DataFrame, + y: Pandas | None = None, ) -> Pandas | tuple[DataFrame, Pandas]: """Apply the outlier strategy on the data. @@ -2608,8 +2551,6 @@ def transform( Transformed target column. Only returned if provided. 
""" - Xt, yt = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - # Estimators with their modules strategies = dict( iforest=["IsolationForest", "ensemble"], @@ -2648,7 +2589,7 @@ def transform( self._log("Pruning outliers...", 1) # Prepare dataset (merge with y and exclude categorical columns) - objective = merge(Xt, yt) if self.include_target and yt is not None else Xt + objective = merge(X, y) if self.include_target and y is not None else X objective = objective.select_dtypes(include=["number"]) outliers = [] @@ -2715,22 +2656,19 @@ def transform( self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2) # Keep only the non-outliers from the data - Xt = Xt[mask] - if yt is not None: - yt = yt[mask] + X = X[mask] + if y is not None: + y = y[mask] else: # Replace the columns in X and y with the new values from objective - Xt.update(objective) - if isinstance(yt, Series) and yt.name in objective: - yt.update(objective[str(yt.name)]) - elif isinstance(yt, DataFrame): - yt.update(objective) - - if yt is None: - return Xt - else: - return Xt, yt + X.update(objective) + if isinstance(y, series_t) and y.name in objective: + y.update(objective[str(y.name)]) + elif isinstance(y, dataframe_t): + y.update(objective) + + return variable_return(X, y) @beartype @@ -2863,7 +2801,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -2871,7 +2809,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -2880,16 +2818,10 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - - self._num_cols = list(Xt.select_dtypes(include="number")) + num_cols = list(X.select_dtypes(include="number")) if not self.include_binary: - self._num_cols = [ - col for col in self._num_cols if ~np.isin(Xt[col].unique(), [0, 1]).all() - ] + num_cols = [c for c in num_cols if ~np.isin(X[c].unique(), [0, 1]).all()] strategies = dict( standard="StandardScaler", @@ -2902,7 +2834,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: self._estimator = estimator(**self.kwargs) self._log("Fitting Scaler...", 1) - self._estimator.fit(Xt[self._num_cols]) + self._estimator.fit(X[num_cols]) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) @@ -2910,7 +2842,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Perform standardization by centering and scaling. Parameters @@ -2918,7 +2850,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. 
Returns @@ -2927,24 +2859,15 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Scaled dataframe. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - self._log("Scaling features...", 1) - X_transformed = self._estimator.transform(Xt[self._num_cols]) + Xt = self._estimator.transform(X[self._estimator.feature_names_in_]) - # If all columns were transformed, just swap sets - if len(self._num_cols) != Xt.shape[1]: - # Replace the numerical columns with the transformed values - for i, col in enumerate(self._num_cols): - Xt[col] = X_transformed[:, i] - else: - Xt = to_df(X_transformed, Xt.index, Xt.columns) + X.update(Xt) # Reorder columns to original order - return Xt + return X @composed(crash, method_to_log) - def inverse_transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the inverse transformation to the data. Parameters @@ -2952,7 +2875,7 @@ def inverse_transform(self, X: XSelector, y: YSelector | None = None) -> DataFra X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -2961,18 +2884,10 @@ def inverse_transform(self, X: XSelector, y: YSelector | None = None) -> DataFra Scaled dataframe. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y) - self._log("Inversely scaling features...", 1) - X_transformed = self._estimator.inverse_transform(Xt[self._num_cols]) + Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_]) + Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_) - # If all columns were transformed, just swap sets - if len(self._num_cols) != Xt.shape[1]: - # Replace the numerical columns with the transformed values - for i, col in enumerate(self._num_cols): - Xt[col] = X_transformed[:, i] - else: - Xt = to_df(X_transformed, Xt.index, Xt.columns) + X.update(Xt) - return Xt + return X diff --git a/atom/ensembles.py b/atom/ensembles.py index 627030948..625f0371d 100644 --- a/atom/ensembles.py +++ b/atom/ensembles.py @@ -10,9 +10,9 @@ from __future__ import annotations from copy import deepcopy +from typing import Any import numpy as np -from beartype.typing import Sequence from joblib import Parallel, delayed from sklearn.base import clone, is_classifier from sklearn.ensemble import StackingClassifier as SC @@ -27,7 +27,7 @@ from sklearn.utils.validation import column_or_1d from typing_extensions import Self -from atom.utils.types import Bool, Int, Predictor, Scalar, XSelector +from atom.utils.types import Bool, Int, Predictor, Scalar, Sequence, XSelector from atom.utils.utils import check_is_fitted @@ -56,7 +56,7 @@ class BaseVoting(BaseEnsemble): def fit( self, X: XSelector, - y: Sequence, + y: Sequence[Any], sample_weight: Sequence[Scalar] | None = None, ) -> Self: """Fit the estimators in the ensemble. @@ -119,7 +119,7 @@ class BaseStacking(BaseEnsemble): def fit( self, X: XSelector, - y: Sequence, + y: Sequence[Any], sample_weight: Sequence[Scalar] | None = None, ) -> Self: """Fit the estimators in the ensemble. @@ -260,7 +260,7 @@ def __init__( def fit( self, X: XSelector, - y: Sequence, + y: Sequence[Any], sample_weight: Sequence[Scalar] | None = None, ) -> Self: """Fit the estimators, skipping prefit ones. 
@@ -378,7 +378,7 @@ class StackingClassifier(BaseStacking, SC): def fit( self, X: XSelector, - y: Sequence, + y: Sequence[Any], sample_weight: Sequence[Scalar] | None = None, ) -> Self: """Fit the estimators, skipping prefit ones. @@ -423,7 +423,7 @@ class StackingRegressor(BaseStacking, SR): def fit( self, X: XSelector, - y: Sequence, + y: Sequence[Any], sample_weight: Sequence[Scalar] | None = None, ) -> Self: """Fit the estimators, skipping prefit ones. diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 9811d754d..c2a7ea586 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -9,16 +9,17 @@ from __future__ import annotations +from collections.abc import Hashable from logging import Logger from pathlib import Path from random import sample +from typing import Literal import featuretools as ft import joblib import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Hashable, Literal, Sequence from gplearn.genetic import SymbolicTransformer from scipy import stats from sklearn.base import is_classifier @@ -40,19 +41,19 @@ Backend, Bool, DataFrame, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, - Scalar, Series, Verbose, XSelector, YSelector, + Pandas, Scalar, Sequence, Series, Verbose, sequence_t, series_t, ) from atom.utils.utils import ( - Goal, Task, check_is_fitted, check_scaling, composed, crash, - get_custom_scorer, is_sparse, lst, merge, method_to_log, sign, to_df, + Goal, Task, check_scaling, composed, crash, get_custom_scorer, is_sparse, + lst, merge, method_to_log, sign, to_df, ) @beartype class FeatureExtractor(TransformerMixin): - """Extract features from datetime columns. + """EXract features from datetime columns. - Create new features extracting datetime elements (day, month, + Create new features eXracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype `datetime64` are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT @@ -60,7 +61,7 @@ class FeatureExtractor(TransformerMixin): This class can be accessed from atom through the [feature_extraction] [atomclassifier-feature_extraction] method. Read more in the - [user guide][extracting-datetime-features]. + [user guide][eXracting-datetime-features]. !!! warning Decision trees based algorithms build their split rules @@ -145,7 +146,7 @@ class FeatureExtractor(TransformerMixin): === "stand-alone" ```pycon import pandas as pd - from atom.feature_engineering import FeatureExtractor + from atom.feature_engineering import FeatureEXractor from sklearn.datasets import load_breast_cancer X, _ = load_breast_cancer(return_X_y=True, as_frame=True) @@ -153,7 +154,7 @@ class FeatureExtractor(TransformerMixin): # Add a datetime column X["date"] = pd.date_range(start="1/1/2018", periods=len(X)) - fe = FeatureExtractor(features=["day"], fmt="%Y-%m-%d", verbose=2) + fe = FeatureEXractor(features=["day"], fmt="%Y-%m-%d", verbose=2) X = fe.transform(X) # Note the date_day column @@ -179,15 +180,15 @@ def __init__( self.drop_columns = drop_columns @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: - """Extract the new features. + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + """EXract the new features. 
Parameters ---------- X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -196,36 +197,31 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Transformed feature set. """ - Xt, yt = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - - self._log("Extracting datetime features...", 1) + self._log("EXracting datetime features...", 1) i = 0 - for name, column in Xt.select_dtypes(exclude="number").items(): + for name, column in X.select_dtypes(exclude="number").items(): if column.dtype.name == "datetime64[ns]": col_dt = column - self._log(f" --> Extracting features from column {name}.", 1) + self._log(f" --> EXracting features from column {name}.", 1) else: - fmt = self.fmt[i] if isinstance(self.fmt, Seq) else self.fmt col_dt = pd.to_datetime( arg=column, errors="coerce", # Converts to NaT if he can't format - format=fmt, + format=self.fmt[i] if isinstance(self.fmt, sequence_t) else self.fmt, infer_datetime_format=True, ) # If >30% values are NaT, the conversion was unsuccessful - if 100. * col_dt.isna().sum() / len(Xt) >= 30: + if 100. * col_dt.isna().sum() / len(X) >= 30: continue # Skip this column else: i += 1 self._log( - f" --> Extracting features from categorical column {name}.", 1 + f" --> EXracting features from categorical column {name}.", 1 ) - # Extract features from the datetime column + # EXract features from the datetime column for fx in map(str.lower, lst(self.features)): if hasattr(col_dt.dt, fx.lower()): values = getattr(col_dt.dt, fx) @@ -236,9 +232,9 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: ) # Skip if the information is not present in the format - if not isinstance(values, Series): + if not isinstance(values, series_t): self._log( - f" --> Extracting feature {fx} failed. " + f" --> EXracting feature {fx} failed. 
" "Result is not a Series.dt.", 2 ) continue @@ -266,21 +262,21 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: # Add every new feature after the previous one new_name = f"{name}_{fx}" - idx = Xt.columns.get_loc(name) + idx = X.columns.get_loc(name) if self.encoding_type == "ordinal" or max_val is None: self._log(f" --> Creating feature {new_name}.", 2) - Xt.insert(idx, new_name, values) + X.insert(idx, new_name, values) elif self.encoding_type == "cyclic": self._log(f" --> Creating cyclic feature {new_name}.", 2) pos = 2 * np.pi * (values - min_val) / np.array(max_val) - Xt.insert(idx, f"{new_name}_sin", np.sin(pos)) - Xt.insert(idx + 1, f"{new_name}_cos", np.cos(pos)) + X.insert(idx, f"{new_name}_sin", np.sin(pos)) + X.insert(idx + 1, f"{new_name}_cos", np.cos(pos)) # Drop the original datetime column if self.drop_columns: - Xt = Xt.drop(columns=name) + X = X.drop(columns=name) - return Xt + return X @beartype @@ -377,7 +373,7 @@ class FeatureGenerator(TransformerMixin): See Also -------- - atom.feature_engineering:FeatureExtractor + atom.feature_engineering:FeatureEXractor atom.feature_engineering:FeatureGrouper atom.feature_engineering:FeatureSelector @@ -394,7 +390,7 @@ class FeatureGenerator(TransformerMixin): atom = ATOMClassifier(X, y) atom.feature_generation(strategy="dfs", n_features=5, verbose=2) - # Note the texture error / worst symmetry column + # Note the teXure error / worst symmetry column print(atom.dataset) ``` @@ -438,7 +434,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -446,7 +442,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X. - If None: y is ignored. @@ -465,10 +461,6 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: Estimator instance. 
""" - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - all_operators = dict( add="add_numeric", sub="subtract_numeric", @@ -491,7 +483,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: if self.strategy == "dfs": # Run deep feature synthesis with transformation primitives - es = ft.EntitySet(dataframes={"X": (Xt, "_index", None, None, None, True)}) + es = ft.EntitySet(dataframes={"X": (X, "_index", None, None, None, True)}) self._dfs = ft.dfs( target_dataframe_name="X", entityset=es, @@ -502,7 +494,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: ) # Select the new features (dfs also returns originals) - self._dfs = self._dfs[Xt.shape[1] - 1:] + self._dfs = self._dfs[X.shape[1] - 1:] # Get a random selection of features if self.n_features and self.n_features < len(self._dfs): @@ -521,17 +513,17 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: init_depth=kwargs.pop("init_depth", (1, 2)), const_range=kwargs.pop("const_range", None), function_set=operators, - feature_names=Xt.columns, + feature_names=X.columns, verbose=kwargs.pop("verbose", 0 if self.verbose < 2 else 1), n_jobs=kwargs.pop("n_jobs", self.n_jobs), random_state=kwargs.pop("random_state", self.random_state), **kwargs, - ).fit(Xt, yt) + ).fit(X, y) return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Generate new features. Parameters @@ -539,7 +531,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -548,13 +540,10 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Transformed feature set. 
""" - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - self._log("Generating new features...", 1) if self.strategy == "dfs": - es = ft.EntitySet(dataframes={"X": (Xt, "index", None, None, None, True)}) + es = ft.EntitySet(dataframes={"X": (X, "index", None, None, None, True)}) dfs = ft.calculate_feature_matrix( features=self._dfs, entityset=es, @@ -562,7 +551,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: ) # Add the new features to the feature set - Xt = pd.concat([Xt, dfs], axis=1).set_index("index") + X = pd.concat([X, dfs], axis=1).set_index("index") self._log(f" --> {len(self._dfs)} new features were added.", 2) @@ -571,7 +560,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: df = pd.DataFrame( data=[ ["", str(fx), fx.fitness_] - for i, fx in enumerate(self.gfg_) if str(fx) not in Xt.columns + for i, fx in enumerate(self.gfg_) if str(fx) not in X.columns ], columns=["name", "description", "fitness"], ) @@ -581,7 +570,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: self._log( " --> The genetic algorithm didn't find any improving features.", 2 ) - return Xt + return X # Select the n_features with the highest fitness df = df.drop_duplicates() @@ -593,13 +582,13 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: f" --> Dropping {(self.n_features or len(self.gfg_)) - len(df)} " "features due to repetition.", 2) - for i, array in enumerate(self.gfg_.transform(Xt)[:, df.index].T): + for i, array in enumerate(self.gfg_.transform(X)[:, df.index].T): # If the column is new, use a default name counter = 0 while True: - name = f"x{Xt.shape[1] + counter}" - if name not in Xt: - Xt[name] = array # Add new feature to X + name = f"x{X.shape[1] + counter}" + if name not in X: + X[name] = array # Add new feature to X df.iat[i, 0] = name break else: @@ -608,12 +597,12 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: self._log(f" --> {len(df)} new features were added.", 2) self.genetic_features_ = df.reset_index(drop=True) - return Xt + return X @beartype class FeatureGrouper(TransformerMixin): - """Extract statistics from similar features. + """EXract statistics from similar features. Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The @@ -662,7 +651,7 @@ class FeatureGrouper(TransformerMixin): See Also -------- - atom.feature_engineering:FeatureExtractor + atom.feature_engineering:FeatureEXractor atom.feature_engineering:FeatureGenerator atom.feature_engineering:FeatureSelector @@ -689,7 +678,7 @@ class FeatureGrouper(TransformerMixin): X, _ = load_breast_cancer(return_X_y=True, as_frame=True) - fg = FeatureGrouper({"group1": ["mean texture", "mean radius"]}, verbose=2) + fg = FeatureGrouper({"group1": ["mean teXure", "mean radius"]}, verbose=2) X = fg.transform(X) print(X) @@ -712,7 +701,7 @@ def __init__( self.drop_columns = drop_columns @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Group features. Parameters @@ -720,7 +709,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). 
- y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -729,8 +718,6 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Transformed feature set. """ - Xt, _ = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - self._log("Grouping features...", 1) if self.operators is None: @@ -742,10 +729,10 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: for name, group in self.groups.items(): for operator in operators: try: - result = Xt[group].apply(getattr(np, operator), axis=1) + result = X[group].apply(getattr(np, operator), axis=1) except AttributeError: try: - result = getattr(stats, operator)(Xt[group], axis=1)[0] + result = getattr(stats, operator)(X[group], axis=1)[0] except AttributeError: raise ValueError( "Invalid value for the operators parameter. Value " @@ -753,7 +740,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: ) try: - Xt[f"{operator}({name})"] = result + X[f"{operator}({name})"] = result except ValueError: raise ValueError( "Invalid value for the operators parameter. Value " @@ -764,9 +751,9 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: self._log(f" --> Group {name} successfully created.", 2) if self.drop_columns: - Xt = Xt.drop(columns=to_drop) + X = X.drop(columns=to_drop) - return Xt + return X @beartype @@ -978,7 +965,7 @@ class FeatureSelector(TransformerMixin): number generator is the `RandomState` used by `np.random`. **kwargs - Any extra keyword argument for the strategy estimator. See the + Any eXra keyword argument for the strategy estimator. See the corresponding documentation for the available options. Attributes @@ -1002,7 +989,7 @@ class FeatureSelector(TransformerMixin): See Also -------- - atom.feature_engineering:FeatureExtractor + atom.feature_engineering:FeatureEXractor atom.feature_engineering:FeatureGenerator atom.feature_engineering:FeatureGrouper @@ -1075,7 +1062,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit the feature selector to the data. The univariate, sfm (when model is not fitted), sfs, rfe and @@ -1087,7 +1074,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X. - If None: y is ignored. @@ -1124,10 +1111,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): model.fit(X_train, y_train) return scoring(model, X_valid, y_valid) - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - self.collinear_ = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"]) self.scaler_ = None @@ -1180,7 +1163,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): }, ) model.task = goal.infer_task(y) - solver = model._get_est() + solver = model._get_est({}) else: raise ValueError( "Invalid value for the solver parameter. 
Unknown " @@ -1197,9 +1180,9 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) if self.n_features is None: - self._n_features = Xt.shape[1] + self._n_features = X.shape[1] elif self.n_features < 1: - self._n_features = int(self.n_features * Xt.shape[1]) + self._n_features = int(self.n_features * X.shape[1]) else: self._n_features = self.n_features @@ -1207,15 +1190,15 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): if self.min_repeated is None: min_repeated = 1 elif self.min_repeated <= 1: - min_repeated = self.min_repeated * len(Xt) + min_repeated = self.min_repeated * len(X) else: min_repeated = int(self.min_repeated) max_repeated: Scalar if self.max_repeated is None: - max_repeated = len(Xt) + max_repeated = len(X) elif self.max_repeated <= 1: - max_repeated = self.max_repeated * len(Xt) + max_repeated = self.max_repeated * len(X) else: max_repeated = int(self.max_repeated) @@ -1229,30 +1212,30 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # Remove features with too high variance if self.min_repeated is not None: - for name, column in Xt.select_dtypes(exclude="number").items(): + for name, column in X.select_dtypes(exclude="number").items(): max_counts = column.value_counts() if min_repeated > max_counts.max(): self._high_variance[name] = (max_counts.idxmax(), max_counts.max()) - Xt = Xt.drop(columns=name) + X = X.drop(columns=name) break # Remove features with too low variance if self.max_repeated is not None: - for name, column in Xt.select_dtypes(exclude="number").items(): + for name, column in X.select_dtypes(exclude="number").items(): for category, count in column.value_counts().items(): if count >= max_repeated: - self._low_variance[name] = (category, 100. * count / len(Xt)) - Xt = Xt.drop(columns=name) + self._low_variance[name] = (category, 100. 
* count / len(X)) + X = X.drop(columns=name) break # Remove features with too high correlation self.collinear = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"]) if self.max_correlation: # Get the Pearson correlation coefficient matrix - if yt is None: - corr_X = Xt.corr() + if y is None: + corr_X = X.corr() else: - corr_matrix = merge(Xt, yt).corr() + corr_matrix = merge(X, y).corr() corr_X, corr_y = corr_matrix.iloc[:-1, :-1], corr_matrix.iloc[:-1, -1] corr = {} @@ -1288,7 +1271,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ignore_index=True, ) - Xt = Xt.drop(columns=self.collinear_["drop"].tolist()) + X = X.drop(columns=self.collinear_["drop"].tolist()) if self.strategy is None: return self # Exit feature_engineering @@ -1319,14 +1302,14 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): solver = self.solver check_y() - self._estimator = SelectKBest(solver, k=self._n_features).fit(Xt, yt) + self._estimator = SelectKBest(solver, k=self._n_features).fit(X, y) elif self.strategy == "pca": - if not is_sparse(Xt): + if not is_sparse(X): # PCA requires the features to be scaled - if not check_scaling(Xt): + if not check_scaling(X): self.scaler_ = Scaler() - Xt = self.scaler_.fit_transform(Xt) + X = self.scaler_.fit_transform(X) estimator = self._get_est_class("PCA", "decomposition") solver_param = "svd_solver" @@ -1342,13 +1325,13 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # The PCA and TruncatedSVD both get all possible components to use # for the plots (n_components must be < n_features and <= n_rows) self._estimator = estimator( - n_components=min(len(Xt), Xt.shape[1] - 1), + n_components=min(len(X), X.shape[1] - 1), **{solver_param: solver}, random_state=self.random_state, **self.kwargs, ) - self._estimator.fit(Xt) + self._estimator.fit(X) self._estimator._comps = min( self._estimator.components_.shape[0], self._n_features ) @@ -1371,7 +1354,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) if prefit: - if list(getattr(solver, "feature_names_in_", [])) != list(Xt.columns): + if list(getattr(solver, "feature_names_in_", [])) != list(X.columns): raise ValueError( "Invalid value for the solver parameter. The " f"{solver.__class__.__name__} estimator " @@ -1380,7 +1363,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._estimator.estimator_ = solver else: check_y() - self._estimator.fit(Xt, yt) + self._estimator.fit(X, y) elif self.strategy in ("sfs", "rfe", "rfecv"): if self.strategy == "sfs": @@ -1412,7 +1395,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"]) # Invert n_features to select them all (default option) - if self._n_features == Xt.shape[1]: + if self._n_features == X.shape[1]: self._n_features = 1 self._estimator = RFECV( @@ -1423,7 +1406,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) with joblib.parallel_backend(backend=self.backend): - self._estimator.fit(Xt, yt) + self._estimator.fit(X, y) else: check_y() @@ -1440,7 +1423,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): "cannot be absent when X_valid is provided." 
) else: - X_valid, y_valid = Xt, yt + X_valid, y_valid = X, y # Get scoring for default objective_function if "objective_function" not in kwargs: @@ -1478,7 +1461,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Transform the data. Parameters @@ -1486,7 +1469,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -1495,9 +1478,6 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Transformed feature set. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - self._log("Performing feature selection ...", 1) # Remove features with too high variance @@ -1505,9 +1485,9 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: self._log( f" --> Feature {fx} was removed due to high variance. " f"Value {h_variance[0]} was the most repeated value with " - f"{h_variance[1]} ({h_variance[1] / len(Xt):.1f}%) occurrences.", 2 + f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.", 2 ) - Xt = Xt.drop(columns=fx) + X = X.drop(columns=fx) # Remove features with too low variance for fx, l_variance in self._low_variance.items(): @@ -1515,7 +1495,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: f" --> Feature {fx} was removed due to low variance. 
Value " f"{l_variance[0]} repeated in {l_variance[1]:.1f}% of the rows.", 2 ) - Xt = Xt.drop(columns=fx) + X = X.drop(columns=fx) # Remove features with too high correlation for col in self.collinear_["drop"]: @@ -1523,36 +1503,36 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: f" --> Feature {col} was removed due to " "collinearity with another feature.", 2 ) - Xt = Xt.drop(columns=col) + X = X.drop(columns=col) # Perform selection based on strategy if self.strategy is None: - return Xt + return X elif self.strategy == "univariate": self._log( f" --> The univariate test selected " f"{self._n_features} features from the dataset.", 2 ) - for n, column in enumerate(Xt): + for n, column in enumerate(X): if not self.univariate_.get_support()[n]: self._log( f" --> Dropping feature {column} " f"(score: {self.univariate_.scores_[n]:.2f} " f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2 ) - Xt = Xt.drop(columns=column) + X = X.drop(columns=column) elif self.strategy == "pca": self._log(" --> Applying Principal Component Analysis...", 2) if self.scaler_: self._log(" --> Scaling features...", 2) - Xt = self.scaler_.transform(Xt) + X = self.scaler_.transform(X) - Xt = to_df( - data=self.pca_.transform(Xt)[:, :self.pca_._comps], - index=Xt.index, + X = to_df( + data=self.pca_.transform(X)[:, :self.pca_._comps], + index=X.index, columns=[f"pca{str(i)}" for i in range(self.pca_._comps)], ) @@ -1567,7 +1547,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: f"{sum(mask)} features from the dataset.", 2 ) - for n, column in enumerate(Xt): + for n, column in enumerate(X): if not mask[n]: if hasattr(self._estimator, "ranking_"): self._log( @@ -1576,7 +1556,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: ) else: self._log(f" --> Dropping feature {column}.", 2) - Xt = Xt.drop(columns=column) + X = X.drop(columns=column) else: # Advanced strategies self._log( @@ -1584,9 +1564,9 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: "features from the dataset.", 2 ) - for column in Xt: + for column in X: if column not in self._estimator.best_feature_list: self._log(f" --> Dropping feature {column}.", 2) - Xt = Xt.drop(columns=column) + X = X.drop(columns=column) - return Xt + return X diff --git a/atom/models/classreg.py b/atom/models/classreg.py index aab000f61..3563277c2 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -9,9 +9,10 @@ from __future__ import annotations +from typing import Any, cast + import numpy as np import pandas as pd -from beartype.typing import Any, cast from optuna.distributions import BaseDistribution from optuna.distributions import CategoricalDistribution as Cat from optuna.distributions import FloatDistribution as Float @@ -449,13 +450,13 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the estimator instance. Parameters ---------- - **params - Unpacked hyperparameters for the estimator. + params: dict + Hyperparameters for the estimator. Returns ------- @@ -1627,9 +1628,14 @@ class LightGBM(ClassRegModel): _module = "lightgbm.sklearn" _estimators = {"classification": "LGBMClassifier", "regression": "LGBMRegressor"} - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the model's estimator with unpacked parameters. 
+ Parameters + ---------- + params: dict + Hyperparameters for the estimator. + Returns ------- Predictor @@ -1911,13 +1917,13 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the estimator instance. Parameters ---------- - **params - Unpacked hyperparameters for the estimator. + params: dict + Hyperparameters for the estimator. Returns ------- @@ -1928,7 +1934,7 @@ def _get_est(self, **params) -> Predictor: if self.engine.get("estimator") == "cuml" and self._goal is Goal.classification: return self._est_class(probability=params.pop("probability", True), **params) else: - return super()._get_est(**params) + return super()._get_est(params) def _get_distributions(self) -> dict[str, BaseDistribution]: """Get the predefined hyperparameter distributions. @@ -2941,9 +2947,14 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the model's estimator with unpacked parameters. + Parameters + ---------- + params: dict + Hyperparameters for the estimator. + Returns ------- Predictor @@ -2956,7 +2967,7 @@ def _get_est(self, **params) -> Predictor: random_state=params.pop("random_state", self.random_state), **params) else: - return super()._get_est(**params) + return super()._get_est(params) def _get_distributions(self) -> dict[str, BaseDistribution]: """Get the predefined hyperparameter distributions. @@ -3060,9 +3071,14 @@ def trials(self) -> pd.DataFrame: return trials - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the model's estimator with unpacked parameters. + Parameters + ---------- + params: dict + Hyperparameters for the estimator. + Returns ------- Predictor diff --git a/atom/models/custom.py b/atom/models/custom.py index 6b9b785d4..bbf3677e3 100644 --- a/atom/models/custom.py +++ b/atom/models/custom.py @@ -7,7 +7,7 @@ """ -from beartype.typing import Any +from typing import Any from atom.basemodel import ClassRegModel from atom.utils.types import Predictor diff --git a/atom/models/ensembles.py b/atom/models/ensembles.py index 620435b24..c9f928bd6 100644 --- a/atom/models/ensembles.py +++ b/atom/models/ensembles.py @@ -9,6 +9,8 @@ from __future__ import annotations +from typing import Any + from atom.basemodel import ClassRegModel from atom.utils.types import Model, Predictor from atom.utils.utils import sign @@ -46,9 +48,14 @@ def __init__(self, models: list[Model], **kwargs): super().__init__(**kw_model) self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model} - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the model's estimator with unpacked parameters. + Parameters + ---------- + params: dict + Hyperparameters for the estimator. + Returns ------- Predictor @@ -103,9 +110,14 @@ def __init__(self, models: list[Model], **kwargs): f"a predict_proba method, got {m.fullname}." ) - def _get_est(self, **params) -> Predictor: + def _get_est(self, params: dict[str, Any]) -> Predictor: """Get the model's estimator with unpacked parameters. + Parameters + ---------- + params: dict + Hyperparameters for the estimator. 
+ Returns ------- Predictor diff --git a/atom/models/ts.py b/atom/models/ts.py index ba1894e32..8a6868325 100644 --- a/atom/models/ts.py +++ b/atom/models/ts.py @@ -9,7 +9,8 @@ from __future__ import annotations -from beartype.typing import Any +from typing import Any + from optuna.distributions import BaseDistribution from optuna.distributions import CategoricalDistribution as Cat from optuna.distributions import IntDistribution as Int diff --git a/atom/nlp.py b/atom/nlp.py index 31dbe5413..b0171bfc0 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -18,7 +18,6 @@ import nltk import pandas as pd from beartype import beartype -from beartype.typing import Sequence from nltk.collocations import ( BigramCollocationFinder, QuadgramCollocationFinder, TrigramCollocationFinder, @@ -29,12 +28,12 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, DataFrame, Engine, FloatLargerZero, VectorizerStarts, Verbose, - XSelector, YSelector, + Bool, DataFrame, Engine, FloatLargerZero, Pandas, Sequence, + VectorizerStarts, Verbose, bool_t, ) from atom.utils.utils import ( - check_is_fitted, check_nltk_module, composed, crash, get_corpus, is_sparse, - merge, method_to_log, to_df, + check_nltk_module, composed, crash, get_corpus, is_sparse, merge, + method_to_log, to_df, ) @@ -202,7 +201,7 @@ def __init__( self.drop_punctuation = drop_punctuation @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the transformations to the data. Parameters @@ -212,7 +211,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -237,7 +236,7 @@ def to_ascii(elem: str) -> str: """ try: - elem.encode("ASCII", errors="strict") # Returns bytes object + elem.encode("ASCII", errors="strict") # Returns byes object except UnicodeEncodeError: norm = unicodedata.normalize("NFKD", elem) return "".join([c for c in norm if not unicodedata.combining(c)]) @@ -253,29 +252,28 @@ def drop_regex(regex: str): Regex pattern to replace. 
""" - if isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].str.replace(regex, "", regex=True) + if isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].str.replace(regex, "", regex=True) else: - Xt[corpus] = Xt[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x]) + X[corpus] = X[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x]) - Xt, yt = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - corpus = get_corpus(Xt) + corpus = get_corpus(X) self._log("Cleaning the corpus...", 1) if self.decode: - if isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].apply(lambda x: to_ascii(x)) + if isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].apply(lambda x: to_ascii(x)) else: - Xt[corpus] = Xt[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc]) + X[corpus] = X[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc]) self._log(" --> Decoding unicode characters to ascii.", 2) if self.lower_case: self._log(" --> Converting text to lower case.", 2) - if isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].str.lower() + if isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].str.lower() else: - Xt[corpus] = Xt[corpus].apply(lambda doc: [str(w).lower() for w in doc]) + X[corpus] = X[corpus].apply(lambda doc: [str(w).lower() for w in doc]) if self.drop_email: if not self.regex_email: @@ -315,17 +313,17 @@ def drop_regex(regex: str): if self.drop_punctuation: self._log(" --> Dropping punctuation from the text.", 2) trans_table = str.maketrans("", "", punctuation) # Translation table - if isinstance(Xt[corpus].iat[0], str): + if isinstance(X[corpus].iat[0], str): func = lambda doc: doc.translate(trans_table) else: func = lambda doc: [str(w).translate(trans_table) for w in doc] - Xt[corpus] = Xt[corpus].apply(func) + X[corpus] = X[corpus].apply(func) # Drop empty tokens from every document - if not isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].apply(lambda doc: [w for w in doc if w]) + if not isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].apply(lambda doc: [w for w in doc if w]) - return Xt + return X @beartype @@ -461,7 +459,7 @@ def __init__( self.lemmatize = lemmatize @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Normalize the text. Parameters @@ -471,7 +469,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. 
Returns @@ -504,18 +502,17 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: else: # "NN", "NNS", "NNP", "NNPS" return wordnet.NOUN - Xt, yt = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - corpus = get_corpus(Xt) + corpus = get_corpus(X) self._log("Normalizing the corpus...", 1) # If the corpus is not tokenized, separate by space - if isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].apply(lambda row: row.split()) + if isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].apply(lambda row: row.split()) stopwords = set() if self.stopwords: - if isinstance(self.stopwords, Bool): + if isinstance(self.stopwords, bool_t): self.stopwords = "english" # Get stopwords from the NLTK library @@ -529,15 +526,15 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: if stopwords: self._log(" --> Dropping stopwords.", 2) f = lambda row: [word for word in row if word not in stopwords] - Xt[corpus] = Xt[corpus].apply(f) + X[corpus] = X[corpus].apply(f) if self.stem: - if isinstance(self.stem, Bool): + if isinstance(self.stem, bool_t): self.stem = "english" self._log(" --> Applying stemming.", 2) ss = SnowballStemmer(language=self.stem.lower()) - Xt[corpus] = Xt[corpus].apply(lambda row: [ss.stem(word) for word in row]) + X[corpus] = X[corpus].apply(lambda row: [ss.stem(word) for word in row]) if self.lemmatize: self._log(" --> Applying lemmatization.", 2) @@ -547,9 +544,9 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: wnl = WordNetLemmatizer() f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in nltk.pos_tag(row)] - Xt[corpus] = Xt[corpus].apply(f) + X[corpus] = X[corpus].apply(f) - return Xt + return X @beartype @@ -689,7 +686,7 @@ def __init__( self.quadgram_freq = quadgram_freq @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Tokenize the text. Parameters @@ -699,7 +696,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. 
Returns @@ -736,14 +733,13 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: return row_c[2:-2].split(sep) - Xt, yt = self._check_input(X, y, columns=getattr(self, "feature_names_in_", None)) - corpus = get_corpus(Xt) + corpus = get_corpus(X) self._log("Tokenizing the corpus...", 1) - if isinstance(Xt[corpus].iat[0], str): + if isinstance(X[corpus].iat[0], str): check_nltk_module("tokenizers/punkt", self.verbose < 2) - Xt[corpus] = Xt[corpus].apply(lambda row: nltk.word_tokenize(row)) + X[corpus] = X[corpus].apply(lambda row: nltk.word_tokenize(row)) ngrams = { "bigrams": BigramCollocationFinder, @@ -754,7 +750,7 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: for attr, finder in ngrams.items(): if frequency := getattr(self, f"{attr[:-1]}_freq"): # Search for all n-grams in the corpus - ngram_fd = finder.from_documents(Xt[corpus]).ngram_fd + ngram_fd = finder.from_documents(X[corpus]).ngram_fd if frequency < 1: frequency = int(frequency * len(ngram_fd)) @@ -765,7 +761,7 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: if freq >= frequency: occur += 1 counts += freq - Xt[corpus] = Xt[corpus].apply(replace_ngrams, args=(ngram,)) + X[corpus] = X[corpus].apply(replace_ngrams, args=(ngram,)) rows.append({attr[:-1]: "_".join(ngram), "frequency": freq}) if rows: @@ -777,7 +773,7 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: else: self._log(f" --> No {attr} found in the corpus.", 2) - return Xt + return X @beartype @@ -935,7 +931,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: XSelector, y: YSelector | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -945,7 +941,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -954,14 +950,11 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: Estimator instance. """ - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - corpus = get_corpus(Xt) + corpus = get_corpus(X) # Convert a sequence of tokens to space separated string - if not isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].apply(lambda row: " ".join(row)) + if not isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].apply(lambda row: " ".join(row)) strategies = dict( bow="CountVectorizer", @@ -976,7 +969,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: self._estimator = estimator(**self.kwargs) self._log("Fitting Vectorizer...", 1) - self._estimator.fit(Xt[corpus]) + self._estimator.fit(X[corpus]) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) @@ -984,7 +977,7 @@ def fit(self, X: XSelector, y: YSelector | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Vectorize the text. Parameters @@ -994,7 +987,7 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. 
- y: int, str, series-like, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API. Returns @@ -1003,24 +996,22 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: Transformed corpus. """ - check_is_fitted(self) - Xt, yt = self._check_input(X, y, columns=self.feature_names_in_) - corpus = get_corpus(Xt) + corpus = get_corpus(X) self._log("Vectorizing the corpus...", 1) # Convert a sequence of tokens to space-separated string - if not isinstance(Xt[corpus].iat[0], str): - Xt[corpus] = Xt[corpus].apply(lambda row: " ".join(row)) + if not isinstance(X[corpus].iat[0], str): + X[corpus] = X[corpus].apply(lambda row: " ".join(row)) - matrix = self._estimator.transform(Xt[corpus]) + matrix = self._estimator.transform(X[corpus]) if hasattr(self._estimator, "get_feature_names_out"): columns = [f"corpus_{w}" for w in self._estimator.get_feature_names_out()] else: # Hashing has no words to put as column names columns = [f"hash{i}" for i in range(1, matrix.shape[1] + 1)] - Xt = Xt.drop(columns=corpus) # Drop original corpus column + X = X.drop(columns=corpus) # Drop original corpus column if "sklearn" not in self._estimator.__class__.__module__: matrix = matrix.get() # Convert cupy sparse array back to scipy @@ -1033,11 +1024,11 @@ def transform(self, X: XSelector, y: YSelector | None = None) -> DataFrame: if not self.return_sparse: self._log(" --> Converting the output to a full array.", 2) matrix = matrix.toarray() - elif not Xt.empty and not is_sparse(Xt): + elif not X.empty and not is_sparse(X): # Raise if there are other columns that are non-sparse raise ValueError( "Invalid value for the return_sparse parameter. The value must " "must be False when X contains non-sparse columns (besides corpus)." 
) - return merge(Xt, to_df(matrix, Xt.index, columns)) + return merge(X, to_df(matrix, X.index, columns)) diff --git a/atom/pipeline.py b/atom/pipeline.py index 1232b47ef..588518ade 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -9,8 +9,10 @@ from __future__ import annotations +from collections.abc import Iterator +from typing import Any, Literal + import numpy as np -from beartype.typing import Any, Iterator, Literal, Sequence from joblib import Memory from sklearn.base import clone from sklearn.pipeline import Pipeline as SkPipeline @@ -21,8 +23,8 @@ from typing_extensions import Self from atom.utils.types import ( - Bool, DataFrame, Estimator, Float, Pandas, Scalar, Verbose, XSelector, - YSelector, + Bool, DataFrame, Estimator, Float, Pandas, Scalar, Sequence, Verbose, + XSelector, YSelector, ) from atom.utils.utils import ( NotFittedError, adjust_verbosity, check_is_fitted, fit_one, diff --git a/atom/plots/basefigure.py b/atom/plots/basefigure.py index da396a6d4..347d6236b 100644 --- a/atom/plots/basefigure.py +++ b/atom/plots/basefigure.py @@ -10,15 +10,15 @@ from __future__ import annotations from itertools import cycle +from typing import Any, Literal import matplotlib.pyplot as plt import plotly.express as px import plotly.graph_objects as go -from beartype.typing import Any, Literal, Sequence from atom.utils.types import ( Bool, FloatZeroToOneExc, Int, IntLargerZero, Legend, Model, PlotBackend, - Scalar, Style, + Scalar, Sequence, Style, sequence_t, ) from atom.utils.utils import divide, rnd, to_rgb @@ -81,7 +81,7 @@ def __init__( if isinstance(palette, str): self._palette = getattr(px.colors.qualitative, palette) self.palette = cycle(self._palette) - elif isinstance(palette, Sequence): + elif isinstance(palette, sequence_t): # Convert color names or hex to rgb self._palette = list(map(to_rgb, palette)) self.palette = cycle(self._palette) diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index 389a47cd5..679b4687f 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -10,15 +10,15 @@ from __future__ import annotations from abc import ABCMeta, abstractmethod +from collections.abc import Iterator from contextlib import contextmanager from pathlib import Path -from typing import overload +from typing import Any, Literal, overload import matplotlib.pyplot as plt import plotly.express as px import plotly.graph_objects as go from beartype import beartype -from beartype.typing import Any, Iterator, Literal, Sequence from mlflow.tracking import MlflowClient from atom.basetracker import BaseTracker @@ -29,7 +29,7 @@ from atom.utils.types import ( Bool, DataFrame, FloatLargerZero, FloatZeroToOneExc, Index, Int, IntLargerZero, Legend, MetricSelector, Model, ModelsSelector, PlotBackend, - RowSelector, Scalar, + RowSelector, Scalar, Sequence, int_t, sequence_t, ) from atom.utils.utils import ( Aesthetics, Task, check_is_fitted, composed, crash, get_custom_scorer, lst, @@ -226,7 +226,7 @@ def _get_set( Selection of rows. 
""" - if isinstance(rows, Sequence): + if isinstance(rows, sequence_t): rows_c = {row: row for row in rows} elif isinstance(rows, str): rows_c = {rows: rows} @@ -258,7 +258,7 @@ def _get_metric(self, metric: MetricSelector, max_one: Bool = False) -> list[str else: inc: list[str] = [] for met in lst(metric): - if isinstance(met, Int): + if isinstance(met, int_t): if int(met) < len(self._metric): inc.append(self._metric[met].name) else: @@ -611,7 +611,7 @@ def _plot( position = dict(x=0.99, y=0.5, xanchor="right", yanchor="middle") elif legend == "center": position = dict(x=0.5, y=0.5, xanchor="center", yanchor="middle") - legend = default_legend | position + legend = default_legend | position elif isinstance(legend, dict): legend = default_legend | legend diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index 767a0b911..b1afe3014 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -11,12 +11,12 @@ from abc import ABCMeta, abstractmethod from pathlib import Path +from typing import Any, Literal import numpy as np import pandas as pd import plotly.graph_objects as go from beartype import beartype -from beartype.typing import Any, Literal, Sequence from nltk.collocations import ( BigramCollocationFinder, QuadgramCollocationFinder, TrigramCollocationFinder, @@ -28,7 +28,7 @@ from atom.utils.constants import PALETTE from atom.utils.types import ( Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, RowSelector, - Segment, Series, + Segment, Sequence, Series, ) from atom.utils.utils import ( check_dependency, crash, divide, get_corpus, lst, replace_missing, rnd, diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py index 832fdfb5d..658ad540c 100644 --- a/atom/plots/hyperparametertuningplot.py +++ b/atom/plots/hyperparametertuningplot.py @@ -10,12 +10,13 @@ from __future__ import annotations from abc import ABCMeta +from collections.abc import Callable from datetime import datetime from pathlib import Path +from typing import Any import numpy as np import plotly.graph_objects as go -from beartype.typing import Any, Callable, Sequence from optuna.importance import FanovaImportanceEvaluator from optuna.trial import TrialState from optuna.visualization._parallel_coordinate import ( @@ -29,7 +30,8 @@ from atom.utils.constants import PALETTE from atom.utils.types import ( Bool, Int, IntLargerEqualZero, IntLargerZero, Legend, MetricSelector, - Model, ModelSelector, ModelsSelector, ParamsSelector, Scalar, Segment, + Model, ModelSelector, ModelsSelector, ParamsSelector, Scalar, Sequence, + int_t, segment_t, ) from atom.utils.utils import ( bk, check_dependency, crash, divide, get_segment, it, lst, rnd, @@ -95,12 +97,12 @@ def _get_hyperparams(params: ParamsSelector | None, model: Model) -> list[str]: """ if params is None: params_c = list(model._ht["distributions"]) - elif isinstance(params, Segment): + elif isinstance(params, segment_t): params_c = get_segment(list(model._ht["distributions"]), params) else: params_c = [] for param in lst(params): - if isinstance(param, Int): + if isinstance(param, int_t): params_c.append(list(model._ht["distributions"])[param]) elif isinstance(param, str): for p in param.split("+"): @@ -741,7 +743,7 @@ def sort_mixed_types(values: list[str]) -> list[str]: for d in dims: if "ticktext" in d: # Skip processing for logarithmic params - if all(isinstance(i, Int) for i in d["values"]): + if all(isinstance(i, int_t) for i in d["values"]): # Order categorical values mapping = [d["ticktext"][i] for i in 
d["values"]] d["ticktext"] = sort_mixed_types(d["ticktext"]) diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index 38f406d3c..2cfd7143f 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -14,13 +14,13 @@ from functools import reduce from itertools import chain from pathlib import Path +from typing import Any, Literal import matplotlib.pyplot as plt import numpy as np import pandas as pd import plotly.graph_objects as go from beartype import beartype -from beartype.typing import Any, Literal from joblib import Parallel, delayed from plotly.colors import unconvert_from_RGB_255, unlabel_rgb from scipy import stats @@ -688,7 +688,7 @@ def plot_errors( # Fit the points using linear regression from atom.models import OrdinaryLeastSquares model = OrdinaryLeastSquares(goal=self.task.goal, branches=self._branches) - estimator = model._get_est().fit(bk.DataFrame(y_true), y_pred) + estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred) fig.add_trace( self._draw_line( diff --git a/atom/plots/shapplot.py b/atom/plots/shapplot.py index 34773319a..87c717b1c 100644 --- a/atom/plots/shapplot.py +++ b/atom/plots/shapplot.py @@ -10,13 +10,14 @@ from __future__ import annotations from abc import ABCMeta +from collections.abc import Hashable from importlib.util import find_spec from pathlib import Path +from typing import Any import matplotlib.pyplot as plt import shap from beartype import beartype -from beartype.typing import Any, Hashable from atom.plots.baseplot import BasePlot from atom.utils.types import ( diff --git a/atom/training.py b/atom/training.py index 450a42e09..b2fcea5c8 100644 --- a/atom/training.py +++ b/atom/training.py @@ -12,19 +12,19 @@ from copy import copy from logging import Logger from pathlib import Path +from typing import Any, Literal import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Any, Literal from joblib.memory import Memory from sklearn.base import BaseEstimator from atom.basetrainer import BaseTrainer from atom.utils.types import ( - Backend, Bool, Engine, FloatLargerZero, Int, IntLargerEqualZero, + Backend, Bool, Engine, FloatLargerZero, IntLargerEqualZero, MetricConstructor, ModelsConstructor, NItems, NJobs, Sequence, Verbose, - Warnings, + Warnings, int_t, ) from atom.utils.utils import ( ClassMap, Goal, composed, crash, lst, method_to_log, @@ -204,7 +204,7 @@ def run(self, *arrays): self._log(f"Metric: {', '.join(lst(self.metric))}", 1) # Convert integer train_sizes to sequence - if isinstance(self.train_sizes, Int): + if isinstance(self.train_sizes, int_t): self.train_sizes = np.linspace(1 / self.train_sizes, 1.0, self.train_sizes) models = ClassMap() diff --git a/atom/utils/types.py b/atom/utils/types.py index 7f819e88d..c7adb2fe6 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -9,16 +9,18 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Annotated +from collections.abc import Callable, Hashable, Iterable, Iterator +from typing import ( + TYPE_CHECKING, Annotated, Any, Literal, SupportsIndex, TypeAlias, + TypedDict, TypeVar, overload, runtime_checkable, +) import modin.pandas as md import numpy as np import pandas as pd import scipy.sparse as sps -from beartype.typing import ( - Any, Callable, Hashable, Iterable, Literal, Protocol, Sequence, TypeAlias, - TypedDict, TypeVar, runtime_checkable, -) +from beartype.door import is_bearable +from beartype.typing import Protocol from beartype.vale import Is from optuna.distributions 
import BaseDistribution from sktime.forecasting.base import ForecastingHorizon @@ -30,8 +32,42 @@ # Classes for type hinting ========================================= >> -T = TypeVar("T") -T_cov = TypeVar("T_cov", covariant=True) +_T = TypeVar("_T") + + +class Sequence(Protocol[_T]): + """Type hint factory for sequences with subscripted types. + + Dynamically creates new `Annotated[Sequence[...], ...]` type hints, + subscripted by the passed type. For subscripted types, it passes + when the type is an array-like and all items in the sequence are of + the subscripted type. + + Parameters + ---------- + _T: object + Arbitrary child type hint to subscript the protocol. + + Notes + ----- + See https://github.com/beartype/beartype/discussions/277#discussioncomment-7086878 + + """ + + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[_T]: ... + @overload + def __getitem__(self, __i: SupportsIndex, /) -> _T: ... + @overload + def __getitem__(self, __s: slice, /) -> Sequence[_T]: ... + + @classmethod + def __class_getitem__(cls, item: Any) -> Annotated[Any, Is]: + return Annotated[ + cls, + Is[lambda lst: isinstance(lst, sequence_t)] + & Is[lambda lst: all(is_bearable(i, item) for i in lst)] + ] class Engine(TypedDict, total=False): @@ -109,7 +145,6 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Variable types for type hinting ================================== >> # General types -# TODO: From Python 3.11, import Self type hint from typing Bool: TypeAlias = bool | np.bool_ Int: TypeAlias = int | np.integer Float: TypeAlias = float | np.floating @@ -127,11 +162,11 @@ def predict(self, *args, **kwargs) -> Pandas: ... Series: TypeAlias = pd.Series | md.Series DataFrame: TypeAlias = pd.DataFrame | md.DataFrame Pandas: TypeAlias = Series | DataFrame -Seq1dim: TypeAlias = list | tuple | np.ndarray | Index | Series # Numerical types IntLargerZero: TypeAlias = Annotated[Int, Is[lambda x: x > 0]] IntLargerEqualZero: TypeAlias = Annotated[Int, Is[lambda x: x >= 0]] +IntLargerOne: TypeAlias = Annotated[Int, Is[lambda x: x > 1]] IntLargerTwo: TypeAlias = Annotated[Int, Is[lambda x: x > 2]] IntLargerFour: TypeAlias = Annotated[Int, Is[lambda x: x > 4]] FloatLargerZero: TypeAlias = Annotated[Scalar, Is[lambda x: x > 0]] @@ -141,17 +176,17 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Types for X, y and fh XTypes: TypeAlias = ( - dict[str, Sequence] - | Sequence[Sequence] - | Iterable[Sequence | tuple[Hashable, Sequence] | dict[str, Sequence]] + dict[str, Sequence[Any]] + | Sequence[Sequence[Any]] + | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]] | np.ndarray | sps.spmatrix | DataFrame ) XSelector: TypeAlias = XTypes | Callable[..., XTypes] -YTypes: TypeAlias = dict[str, Any] | Sequence | Series | XSelector +YTypes: TypeAlias = dict[str, Any] | Sequence[Any] | XSelector YSelector: TypeAlias = Int | str | YTypes -FHSelector: TypeAlias = int | Sequence | Index | Series | ForecastingHorizon +FHSelector: TypeAlias = int | Sequence[Any] | ForecastingHorizon # Return types for transform methods TReturn: TypeAlias = np.ndarray | sps.spmatrix | Series | DataFrame @@ -203,9 +238,9 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
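As a minimal usage sketch of the `Sequence` type-hint factory introduced above (assuming `atom.utils.types` exposes `Int`, `Sequence`, and `sequence_t` exactly as in this patch), subscripting the protocol yields an `Annotated` hint that beartype can validate at runtime:

```python
import numpy as np
from beartype.door import is_bearable

from atom.utils.types import Int, Sequence, sequence_t

# Subscripting builds Annotated[Sequence, Is[...]]: the value must be an
# accepted array-like and every item must match the subscripted type.
print(is_bearable([1, 2, 3], Sequence[Int]))            # True
print(is_bearable(np.array([1, 2, 3]), Sequence[Int]))  # True (items are np.integer)
print(is_bearable("abc", Sequence[Int]))                # False (str is not an accepted array-like)
print(is_bearable([1, "a"], Sequence[Int]))             # False (not every item is an Int)

# Plain isinstance checks (where mypy rejects the protocol) use the
# runtime tuples defined in the same module instead.
print(isinstance((1, 2, 3), sequence_t))                # True
```

The same pattern is what lets `@beartype`-decorated signatures such as `Sequence[Scalar]` in the aliases below be checked item by item.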
CategoricalStrats: TypeAlias = Literal["drop", "most_frequent"] DiscretizerStrats: TypeAlias = Literal["uniform", "quantile", "kmeans", "custom"] Bins: TypeAlias = ( - IntLargerZero + IntLargerOne | Sequence[Scalar] - | dict[str, IntLargerZero | Sequence[Scalar]] + | dict[str, IntLargerOne | Sequence[Scalar]] ) NormalizerStrats: TypeAlias = Literal["yeojohnson", "boxcox", "quantile"] PrunerStrats: TypeAlias = Literal[ @@ -257,3 +292,19 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Mlflow stages Stages: TypeAlias = Literal["None", "Staging", "Production", "Archived"] + + +# Variable types for isinstance ================================== >> + +# Although injecting the type hints directly to isinstance works, mypy fails +# https://github.com/python/mypy/issues/11673 +# https://github.com/python/mypy/issues/16358 +bool_t = (bool, np.bool_) +int_t = (int, np.integer) +float_t = (float, np.floating) +segment_t = (slice, range) +tsindex_t = TSIndex.__args__ +series_t = (pd.Series, md.Series) +sequence_t = (range, list, tuple, np.ndarray, pd.Index, md.Index, pd.Series, md.Series) +dataframe_t = (pd.DataFrame, md.DataFrame) +pandas_t = (pd.Series, md.Series, pd.DataFrame, md.DataFrame) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 647489699..b6396b9c0 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -13,6 +13,7 @@ import sys import warnings from collections import deque +from collections.abc import Callable, Hashable, Iterator from contextlib import contextmanager from copy import copy from dataclasses import dataclass @@ -23,7 +24,7 @@ from inspect import Parameter, signature from itertools import cycle from types import GeneratorType, MappingProxyType -from typing import TYPE_CHECKING, overload +from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload from unittest.mock import patch import mlflow @@ -33,10 +34,8 @@ import pandas as pd import plotly.graph_objects as go import scipy.sparse as sps +from beartype import beartype from beartype.door import is_bearable -from beartype.typing import ( - Any, Callable, Hashable, Iterator, Literal, Sequence, TypeVar -) from IPython.display import display from matplotlib.colors import to_rgba from mlflow.models.signature import infer_signature @@ -57,8 +56,9 @@ from atom.utils.types import ( Bool, DataFrame, Estimator, Float, Index, IndexSelector, Int, IntLargerEqualZero, MetricConstructor, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Seq1dim, Series, Transformer, TReturn, TReturns, Verbose, - XSelector, YSelector, YTypes, + Scorer, Segment, Sequence, Series, Transformer, TReturn, TReturns, Verbose, + XSelector, YSelector, YTypes, dataframe_t, int_t, pandas_t, segment_t, + sequence_t, series_t, ) @@ -70,6 +70,7 @@ T = TypeVar("T") T_Pandas = TypeVar("T_Pandas", Series, DataFrame) +T_Transformer = TypeVar("T_Transformer", bound=Transformer) # Classes ========================================================== >> @@ -104,22 +105,22 @@ def infer_task(self, y: Pandas) -> Task: """ if self.value == 1: - if isinstance(y, Series): + if isinstance(y, series_t): return Task.regression else: return Task.multioutput_regression elif self.value == 2: - if isinstance(y, Series): + if isinstance(y, series_t): return Task.univariate_forecast else: return Task.multivariate_forecast - if isinstance(y, DataFrame): + if isinstance(y, dataframe_t): if all(y[col].nunique() == 2 for col in y.columns): return Task.multilabel_classification else: return Task.multiclass_multioutput_classification - elif isinstance(y.iloc[0], 
Seq1dim): + elif isinstance(y.iloc[0], sequence_t): return Task.multilabel_classification elif y.nunique() == 1: raise ValueError(f"Only found 1 target value: {y.unique()[0]}") @@ -257,7 +258,7 @@ def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: else: inc = [] for col in lst(self.stratify): - if isinstance(col, Int): + if isinstance(col, int_t): if -df.shape[1] <= col <= df.shape[1]: inc.append(df.columns[int(col)]) else: @@ -1037,7 +1038,7 @@ def _conv(key: Any) -> Any: return key.lower() if isinstance(key, str) else key def _get_data(self, key: Any) -> Any: - if isinstance(key, Int) and key not in self.keys(): + if isinstance(key, int_t) and key not in self.keys(): try: return self.__data[key] except IndexError: @@ -1072,17 +1073,15 @@ def __init__(self, *args, key: str = "name"): self.__data.append(self._check(elem)) def __getitem__(self, key: Any) -> Any: - if isinstance(key, Seq1dim): + if isinstance(key, sequence_t): return self.__class__(*[self._get_data(k) for k in key], key=self.__key) - elif isinstance(key, Segment): + elif isinstance(key, segment_t): return self.__class__(*get_segment(self.__data, key), key=self.__key) - elif isinstance(key, slice): - return self.__class__(*self.__data[key], key=self.__key) else: return self._get_data(key) def __setitem__(self, key: Any, value: Any): - if isinstance(key, Int): + if isinstance(key, int_t): self.__data[key] = self._check(value) else: try: @@ -1167,7 +1166,7 @@ def flt(x: Any) -> Any: Object. """ - return x[0] if isinstance(x, Seq1dim) and len(x) == 1 else x + return x[0] if isinstance(x, sequence_t) and len(x) == 1 else x def lst(x: Any) -> list[Any]: @@ -1184,7 +1183,7 @@ def lst(x: Any) -> list[Any]: Item as list with length 1 or provided sequence as list. """ - return list(x) if isinstance(x, dict | Seq1dim | ClassMap) else [x] + return list(x) if isinstance(x, (dict, *sequence_t, ClassMap)) else [x] def it(x: Any) -> Any: @@ -1357,7 +1356,7 @@ def get_nan(dtype: Dtype) -> float | NAType: # Always convert these values default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf] - if isinstance(X, Series): + if isinstance(X, series_t): return X.replace( to_replace=(missing_values or []) + default_values, value=get_nan(X.dtype), @@ -1383,7 +1382,7 @@ def get_cols(elem: Pandas) -> list[Series]: Columns in elem. """ - if isinstance(elem, Series): + if isinstance(elem, series_t): return [elem] else: return [elem[col] for col in elem.columns] @@ -1753,7 +1752,7 @@ def to_pyarrow(column: Series, inverse: bool = False) -> Dtype: ---------- column: series Column to get the dtype from. If it already has a pyarrow - dtype, return original dtype. + dtype, return the original dtype. inverse: bool, default=False Whether to convert to pyarrow or back from pyarrow. @@ -2033,7 +2032,7 @@ def check_attr(attr: str) -> bool: Whether the attribute's value is False or empty. """ - if isinstance(value := getattr(obj, attr), Pandas): + if isinstance(value := getattr(obj, attr), pandas_t): return value.empty else: return not value @@ -2410,6 +2409,8 @@ def transform_one( def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: """Convert to df and set correct column names and order. + If ATOM's data backend="pyarrow", convert the dtypes. 
+ Parameters ---------- out: np.ndarray, sps.matrix, series or dataframe @@ -2427,7 +2428,7 @@ def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: use_cols = [c for c in inc if c in og.columns] # Convert to pandas and assign proper column names - if not isinstance(out, DataFrame): + if not isinstance(out, dataframe_t): if hasattr(transformer, "get_feature_names_out"): columns = transformer.get_feature_names_out() elif hasattr(transformer, "get_feature_names"): @@ -2435,8 +2436,10 @@ def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: columns = transformer.get_feature_names() else: columns = name_cols(out, og, use_cols) + else: + columns = out.columns - out = to_df(out, index=og.index, columns=columns) + out = to_df(out, index=og.index, columns=columns) # Reorder columns if only a subset was used if len(use_cols) != og.shape[1]: @@ -2488,7 +2491,7 @@ def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: name=getattr(yt, "name", None), columns=getattr(yt, "columns", None), ) - if isinstance(yt, DataFrame): + if isinstance(yt, dataframe_t): y_new = prepare_df(y_new, yt) elif "X" in params and X is not None and any(c in Xt for c in inc): # X in -> X out @@ -2502,7 +2505,7 @@ def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: columns=getattr(yt, "columns", None), ) X_new = Xt if Xt is None else Xt.set_index(y_new.index) - if isinstance(yt, DataFrame): + if isinstance(yt, dataframe_t): y_new = prepare_df(y_new, yt) return X_new, y_new @@ -2729,39 +2732,85 @@ def wrapper(*args, **kwargs) -> Any: return wrapper +def wrap_methods(f: Callable) -> Callable: + """Wrap transformer methods with shared code. + + The following operations are always performed: + + - Transform the input to pandas types. + - Check if the instance is fitted before transforming. + - Convert output to pyarrow dtypes if specified in config. + + Parameters + ---------- + f: callable + Function to decorate. + + check_fitted: bool + Whether to check if the instance is fitted. 
+ + """ + + @wraps(f) + @beartype + def wrapper( + self: T_Transformer, + X: XSelector | None = None, + y: YSelector | None = None, + **kwargs, + ) -> T_Transformer | Pandas | tuple[DataFrame, Pandas]: + if f.__name__ == "fit": + Xt, yt = self._check_input(X, y) + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + return f(self, Xt, yt, **kwargs) + else: + if "TransformerMixin" not in str(self.fit): + check_is_fitted(self) + Xt, yt = self._check_input( + X=X, + y=y, + columns=getattr(self, "feature_names_in_", None), + name=getattr(self, "target_names_in_", None), + ) + return f(self, Xt, yt, **kwargs) + + return wrapper + + # Custom scorers =================================================== >> -def true_negatives(y_true: Sequence, y_pred: Sequence) -> Int: +def true_negatives(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Int: return confusion_matrix(y_true, y_pred).ravel()[0] -def false_positives(y_true: Sequence, y_pred: Sequence) -> Int: +def false_positives(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Int: return confusion_matrix(y_true, y_pred).ravel()[1] -def false_negatives(y_true: Sequence, y_pred: Sequence) -> Int: +def false_negatives(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Int: return confusion_matrix(y_true, y_pred).ravel()[2] -def true_positives(y_true: Sequence, y_pred: Sequence) -> Int: +def true_positives(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Int: return confusion_matrix(y_true, y_pred).ravel()[3] -def false_positive_rate(y_true: Sequence, y_pred: Sequence) -> Float: +def false_positive_rate(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Float: tn, fp, _, _ = confusion_matrix(y_true, y_pred).ravel() return fp / (fp + tn) -def true_positive_rate(y_true: Sequence, y_pred: Sequence) -> Float: +def true_positive_rate(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Float: _, _, fn, tp = confusion_matrix(y_true, y_pred).ravel() return tp / (tp + fn) -def true_negative_rate(y_true: Sequence, y_pred: Sequence) -> Float: +def true_negative_rate(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Float: tn, fp, _, _ = confusion_matrix(y_true, y_pred).ravel() return tn / (tn + fp) -def false_negative_rate(y_true: Sequence, y_pred: Sequence) -> Float: +def false_negative_rate(y_true: Sequence[Int], y_pred: Sequence[Int]) -> Float: _, _, fn, tp = confusion_matrix(y_true, y_pred).ravel() return fn / (fn + tp) diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py index 553d7177f..9e1634bd7 100644 --- a/docs_sources/scripts/autodocs.py +++ b/docs_sources/scripts/autodocs.py @@ -17,7 +17,8 @@ Parameter, getdoc, getmembers, getsourcelines, isclass, isfunction, ismethod, isroutine, signature, ) -from beartype.typing import Any, Callable, Optional +from typing import Any, Optional +from collections.abc import Callable import regex as re import yaml diff --git a/docs_sources/user_guide/nomenclature.md b/docs_sources/user_guide/nomenclature.md index b6ddf1c8f..ef758c094 100644 --- a/docs_sources/user_guide/nomenclature.md +++ b/docs_sources/user_guide/nomenclature.md @@ -119,6 +119,14 @@ range of values. When given as a parameter type, it includes both and [slice](https://docs.python.org/3/library/functions.html#slice). +
+
sequence
+
+
+A one-dimensional, indexable array of type [sequence](https://docs.python.org/3/library/stdtypes.html#sequence-types-list-tuple-range)
+(except string), [np.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html),
+[index][] or [series][]. This is the standard input format for a dataset's target
+column.
+
+
One-dimensional ndarray with axis labels of type @@ -127,14 +135,6 @@ or its [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/series.h counterpart.
-
-
series-like
-
-
-A one-dimensional, indexable array of type [sequence](https://docs.python.org/3/library/stdtypes.html#sequence-types-list-tuple-range),
-[np.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html),
-[index][] or [series][]. This is the standard input format for a dataset's target
-column.
-
-
-
The dependent variable in a supervised learning task. Passed as `y` to diff --git a/tests/conftest.py b/tests/conftest.py index a4e55a4de..33567403c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,12 +10,12 @@ from __future__ import annotations from pathlib import Path +from typing import Any import numpy as np import pandas as pd import pytest from _pytest.monkeypatch import MonkeyPatch -from beartype.typing import Any, Sequence from sklearn.base import BaseEstimator from sklearn.datasets import ( load_breast_cancer, load_diabetes, load_wine, @@ -26,7 +26,7 @@ from sktime.datasets import load_airline, load_longley from sktime.split import temporal_train_test_split -from atom.utils.types import DataFrame, Pandas, XSelector +from atom.utils.types import DataFrame, Pandas, Sequence, XSelector from atom.utils.utils import merge, n_cols, to_df, to_pandas diff --git a/tests/test_api.py b/tests/test_api.py index 1f6725b2e..bddd8100b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -24,7 +24,7 @@ def test_atommodel(): ) atom = ATOMRegressor(X_reg, y_reg, random_state=1) - atom.run(model, errors="raise") + atom.run(model) assert model is not huber # Is cloned assert model.name == "huber1" assert model.acronym == "huber" diff --git a/tests/test_atom.py b/tests/test_atom.py index ae7c1fada..53195b34b 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -603,8 +603,9 @@ def func_test(df): def test_add_sparse_matrices(): """Assert that transformers that return sp.matrix are accepted.""" + ohe = OneHotEncoder(handle_unknown="ignore").set_output(transform="default") atom = ATOMClassifier(X10_str, y10, shuffle=False, random_state=1) - atom.add(OneHotEncoder(handle_unknown="ignore"), columns=2) + atom.add(ohe, columns=2) assert atom.shape == (10, 8) # Creates 4 extra columns @@ -640,6 +641,14 @@ def test_raise_length_mismatch(): atom.prune(columns=[2, 4]) +def test_add_pyarrow_columns(): + """Assert that columns keep the pyarrow dtype.""" + atom = ATOMClassifier(X_bin, y_bin, engine={"data": "pyarrow"}, random_state=1) + assert isinstance(atom.dtypes[0], pd.ArrowDtype) + atom.scale() + assert isinstance(atom.dtypes[0], pd.ArrowDtype) + + def test_add_derivative_columns_keep_position(): """Assert that derivative columns go after the original.""" atom = ATOMClassifier(X10_str, y10, random_state=1) diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 7aa75011b..569463a87 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -138,7 +138,7 @@ def test_getitem_no_dataset(): def test_getitem_int(): """Assert that getitem works for a column index.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) - assert atom[0] is atom["mean radius"] + assert_frame_equal(atom[0], atom["mean radius"]) def test_getitem_str_from_branch(): diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index ac9242eb2..3562fb7f9 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -161,12 +161,6 @@ def test_experiment_dagshub(dagshub, request, token, _): assert "dagshub" not in mlflow.get_tracking_uri() -def test_random_state_negative_int(): - """Assert that an error is raised for a negative random_state.""" - with pytest.raises(ValueError, match=".*random_state parameter.*"): - BaseTransformer(random_state=-1) - - def test_device_id_no_value(): """Assert that the device id can be left empty.""" base = BaseTransformer(device="gpu") diff --git a/tests/test_data_cleaning.py b/tests/test_data_cleaning.py index 
a0a89376c..41c907992 100644 --- a/tests/test_data_cleaning.py +++ b/tests/test_data_cleaning.py @@ -13,6 +13,7 @@ from category_encoders.target_encoder import TargetEncoder from imblearn.combine import SMOTETomek from pandas.testing import assert_frame_equal, assert_series_equal +from sklearn.base import clone from sklearn.preprocessing import StandardScaler from atom.data_cleaning import ( @@ -29,6 +30,18 @@ # Test TransformerMixin ============================================ >> +def test_clone(): + """Assert that cloning the transformer keeps internal attributes.""" + pruner = Pruner().fit(X_bin) + pruner._cols = [0] + assert hasattr(clone(pruner), "_cols") + + +def test_transform_check_is_fitted(): + """Assert that an error is raised when not fitted.""" + pytest.raises(NotFittedError, Scaler().transform, X_bin) + + def test_fit_transform(): """Assert that the fit_transform method works as intended.""" X_1 = Scaler().fit_transform(X_bin) @@ -36,12 +49,6 @@ def test_fit_transform(): assert_frame_equal(X_1, X_2) -def test_fit_transform_no_fit(): - """Assert that the fit_transform method works when no fit method.""" - X, y = Balancer().fit_transform(X_bin, y_bin) - assert len(X) > len(X_bin) - - def test_inverse_transform(): """Assert that the inverse_transform returns the data unchanged.""" encoder = Encoder().fit(X_bin) @@ -117,8 +124,6 @@ def test_undersampling_keeps_indices(): def test_combinations_numerical_index(): """Assert that new samples have an increasing int index.""" X, y = Balancer(strategy="smoteenn").fit_transform(X_bin, y_bin) - print(X_bin) - print(X) assert not all(idx in X.index for idx in X_bin.index) # Samples were dropped assert max(X.index) > max(X_bin.index) # Samples were added @@ -255,11 +260,12 @@ def test_cleaner_target_mapping_binary(): # Test Discretizer ================================================= >> -def test_invalid_bins_missing_column(): - """Assert that an error is raised when a column is missing.""" - discretizer = Discretizer(strategy="uniform", bins={"invalid": 5}) - with pytest.raises(ValueError, match=".*not found in the dictionary.*"): - discretizer.fit(X_bin) +def test_missing_columns_in_dict_are_ignored(): + """Assert that only columns in the dict are transformed.""" + discretizer = Discretizer(strategy="uniform", bins={"mean radius": 5}) + X = discretizer.fit_transform(X_bin) + assert X["mean radius"].dtype.kind == "O" + assert X["mean texture"].dtype.kind == "f" def test_invalid_bins_custom_strategy(): @@ -375,11 +381,6 @@ def test_encoder_custom_estimator(): assert X.at[0, "x2"] != "a" -def test_encoder_check_is_fitted(): - """Assert that an error is raised if the instance is not fitted.""" - pytest.raises(NotFittedError, Encoder().transform, X_bin, y_bin) - - def test_missing_values_are_propagated(): """Assert that missing values are propagated.""" encoder = Encoder(max_onehot=None) @@ -432,11 +433,6 @@ def test_kwargs_parameters(): # Test Imputer ===================================================== >> -def test_imputer_check_is_fitted(): - """Assert that an error is raised if the instance is not fitted.""" - pytest.raises(NotFittedError, Imputer().transform, X_bin, y_bin) - - @pytest.mark.parametrize("missing", [None, np.NaN, np.inf, -np.inf, 99]) def test_imputing_all_missing_values_numeric(missing): """Assert that all missing values are imputed in numeric columns.""" @@ -573,11 +569,6 @@ def test_imputing_non_numeric_most_frequent(): # Test Normalizer ======================================================= >> -def 
test_normalizer_check_is_fitted(): - """Assert that an error is raised when not fitted.""" - pytest.raises(NotFittedError, Normalizer().transform, X_bin) - - @pytest.mark.parametrize("strategy", ["yeojohnson", "boxcox", "quantile"]) def test_normalizer_all_strategies(strategy): """Assert that all strategies work as intended.""" @@ -730,11 +721,6 @@ def test_pruner_attach_attribute(): # Test Scaler ====================================================== >> -def test_scaler_check_is_fitted(): - """Assert that an error is raised when not fitted.""" - pytest.raises(NotFittedError, Scaler().transform, X_bin) - - @pytest.mark.parametrize("strategy", ["standard", "minmax", "maxabs", "robust"]) def test_scaler_all_strategies(strategy): """Assert that all strategies work as intended.""" diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index 903d06a73..17e10f2d1 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -251,7 +251,7 @@ def test_error_y_is_None(): @pytest.mark.parametrize("min_repeated", [2, 0.1]) def test_remove_high_variance(min_repeated): - """Assert that high variance features are removed.""" + """Assert that high-variance features are removed.""" X = X_bin.copy() X["invalid"] = [f"{i}" for i in range(len(X))] # Add column with maximum variance selector = FeatureSelector(min_repeated=min_repeated, max_repeated=None) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index ecee580b3..5b16e8d4a 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -21,7 +21,7 @@ @pytest.fixture def pipeline(): - """Get a pipeline from atom with/without final estimator.""" + """Get a pipeline from atom with/without a final estimator.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) atom.clean() atom.impute() @@ -37,7 +37,7 @@ def get_pipeline(model): def test_getattr(pipeline): """Assert that attributes can be fetched from the final estimator.""" - pl = pipeline(model=True).fit(X_bin, y_bin) + pl = pipeline(model=True) assert isinstance(pl.coef_, np.ndarray) # Final estimator has no attribute @@ -56,7 +56,6 @@ def test_fit(pipeline): def test_internal_attrs_are_saved(pipeline): """Assert that cols and train_only attrs are stored after clone.""" pl = pipeline(model=False) - pl.fit(X_bin, y_bin) assert pl.steps[-1][1]._cols == ["mean radius"] assert pl.steps[-2][1]._train_only is True
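The last two tests rely on cloning preserving attributes that were set after construction (such as `_cols` and `_train_only`). Below is a self-contained sketch of that mechanism using scikit-learn's `__sklearn_clone__` hook (available from scikit-learn 1.3); the transformer is a hypothetical stand-in, not ATOM's actual TransformerMixin:

```python
from copy import deepcopy

from sklearn.base import BaseEstimator, TransformerMixin, clone


class KeepAttrsTransformer(TransformerMixin, BaseEstimator):
    """Toy transformer whose non-init attributes survive sklearn.clone."""

    def __sklearn_clone__(self):
        # Default cloning only restores __init__ parameters; copy the
        # rest of the instance dict as well (e.g., _cols, _train_only).
        cloned = type(self)(**self.get_params())
        cloned.__dict__.update(deepcopy(self.__dict__))
        return cloned

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


est = KeepAttrsTransformer()
est._cols = ["mean radius"]
est._train_only = True

print(clone(est)._cols)        # ['mean radius'] rather than AttributeError
print(clone(est)._train_only)  # True
```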