diff --git a/atom/_show_versions.py b/atom/_show_versions.py index ed013853e..56dfcbdc9 100644 --- a/atom/_show_versions.py +++ b/atom/_show_versions.py @@ -20,12 +20,11 @@ "atom", "beartype", "category_encoders", - "dagshub", "dill", + "featuretools", "gplearn", "imblearn", "ipywidgets", - "featuretools", "joblib", "matplotlib", "mlflow", @@ -35,14 +34,31 @@ "optuna", "pandas", "plotly", - "ray", - "requests", "sklearn", - "sklearnex", # Has no __version__ attribute "scipy", "shap", "sktime", + "statsmodels", "zoofs", # Has no __version__ attribute + "botorch", + "catboost", + "dagshub", + "dask[distributed]", + "explainerdashboard", + "gradio", + "lightgbm", + "modin[ray]", + "polars", + "pyarrow", + "pyspark", + "ray[serve]", + "requests", + "sklearnex", + "schemdraw", + "statsforecast", + "sweetviz", + "wordcloud", + "xgboost", ] diff --git a/atom/api.py b/atom/api.py index 6bb22eaaf..31dc391c1 100644 --- a/atom/api.py +++ b/atom/api.py @@ -158,20 +158,20 @@ class ATOMClassifier(ATOM): **X, train, test: dataframe-like**
            Feature set with shape=(n_samples, n_features).

-        **y: int, str or sequence**<br>
-            Target column corresponding to `X`.
+        **y: int, str, sequence or dataframe-like**<br>
+ Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. - y: int, str, dict, sequence or dataframe, default=-1 - Target column corresponding to `X`. + y: int, str, sequence or dataframe-like, default=-1 + Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. @@ -257,9 +257,16 @@ class ATOMClassifier(ATOM): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -276,6 +283,7 @@ class ATOMClassifier(ATOM): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -428,24 +436,24 @@ class ATOMForecaster(ATOM): Exogenous feature set corresponding to y, with shape=(n_samples, n_features). - **y: int, str or sequence**
+ **y: int, str, sequence or dataframe-like**
Time series. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. - y: int, str, dict, sequence or dataframe, default=-1 + y: int, str, sequence or dataframe-like, default=-1 Time series. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. This parameter is ignored if the time series is provided through `arrays`. @@ -526,9 +534,16 @@ class ATOMForecaster(ATOM): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -545,6 +560,7 @@ class ATOMForecaster(ATOM): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -689,24 +705,24 @@ class ATOMRegressor(ATOM): **X, train, test: dataframe-like**
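A minimal usage sketch of the expanded `engine` data options and the new `dask` backend documented in the hunks above (the dict form of `engine`, the toy dataset, and the polars output are illustrative assumptions, not part of this diff):

```python
from sklearn.datasets import load_breast_cancer

from atom import ATOMClassifier

# Sketch only: assumes `engine` accepts a dict with the "data" and
# "estimator" keys listed above, and that dask[distributed] is installed
# for the new "dask" backend.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

atom = ATOMClassifier(
    X, y,
    engine={"data": "polars", "estimator": "sklearn"},
    backend="dask",
    n_jobs=2,
    verbose=1,
)
atom.scale()  # transformers still run on pandas internally...
print(type(atom.transform(X)))  # ...but the output follows the "data" engine, e.g. polars
```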
            Feature set with shape=(n_samples, n_features).

-        **y: int, str or sequence**<br>
-            Target column corresponding to `X`.
+        **y: int, str, sequence or dataframe-like**<br>
+ Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. - y: int, str, dict, sequence or dataframe, default=-1 - Target column corresponding to `X`. + y: int, str, sequence or dataframe-like, default=-1 + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. This parameter is ignored if the target column is provided through `arrays`. @@ -775,9 +791,16 @@ class ATOMRegressor(ATOM): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -794,6 +817,7 @@ class ATOMRegressor(ATOM): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the diff --git a/atom/atom.py b/atom/atom.py index 37abdd8ab..8b6c600de 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -27,12 +27,10 @@ from scipy import stats from sklearn.pipeline import Pipeline as SkPipeline from sklearn.utils.metaestimators import available_if -from statsmodels.stats.diagnostic import acorr_ljungbox -from statsmodels.tsa.stattools import adfuller, kpss from atom.baserunner import BaseRunner from atom.basetransformer import BaseTransformer -from atom.branch import Branch, BranchManager +from atom.data import Branch, BranchManager from atom.data_cleaning import ( Balancer, Cleaner, Decomposer, Discretizer, Encoder, Imputer, Normalizer, Pruner, Scaler, TransformerMixin, @@ -50,22 +48,21 @@ ) from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING, __version__ from atom.utils.types import ( - Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DataFrame, - DiscretizerStrats, Engine, EngineTuple, Estimator, FeatureNamesOut, - FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, - FloatLargerZero, FloatZeroToOneInc, Index, IndexSelector, Int, - IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, - ModelsConstructor, NItems, NJobs, NormalizerStrats, NumericalStrats, - Operators, Pandas, Predictor, PrunerStrats, RowSelector, Scalar, - ScalerStrats, Seasonality, Sequence, Series, SPDict, TargetSelector, - Transformer, VectorizerStarts, Verbose, Warnings, XSelector, YSelector, - sequence_t, + Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DiscretizerStrats, + Engine, EngineTuple, Estimator, FeatureNamesOut, FeatureSelectionSolvers, + FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, + FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, + IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, + 
NormalizerStrats, NumericalStrats, Operators, Predictor, PrunerStrats, + RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, SPDict, + TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, XReturn, + XSelector, YReturn, YSelector, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk, - check_dependency, check_scaling, composed, crash, fit_one, flt, get_cols, - get_custom_scorer, has_task, is_sparse, lst, make_sklearn, merge, - method_to_log, replace_missing, sign, to_pyarrow, + ClassMap, DataConfig, DataContainer, Goal, adjust, check_dependency, + composed, crash, fit_one, flt, get_cols, get_custom_scorer, has_task, + is_sparse, lst, make_sklearn, merge, method_to_log, n_cols, + replace_missing, sign, ) @@ -156,9 +153,8 @@ def __init__( self._log(f"Parallel processing with {self.n_jobs} cores.", 1) elif self.backend != "loky": self._log( - "Leaving n_jobs=1 ignores all parallelization. Set n_jobs>1 to make use " - f"of the {self.backend} parallelization backend.", - 1, + "Leaving n_jobs=1 ignores all parallelization. Set n_jobs>1 to " + f"make use of the {self.backend} parallelization backend.", 1, severity="warning", ) if "cpu" not in self.device.lower(): @@ -167,7 +163,7 @@ def __init__( self._log(f"Data engine: {self.engine.data}", 1) if self.engine.estimator != EngineTuple().estimator: self._log(f"Estimator engine: {self.engine.estimator}", 1) - if self.backend == "ray" or self.n_jobs > 1: + if self.backend != "loky" and self.n_jobs > 1: self._log(f"Parallelization backend: {self.backend}", 1) if self.memory.location is not None: self._log(f"Cache storage: {os.path.join(self.memory.location, 'joblib')}", 1) @@ -315,27 +311,28 @@ def missing(self, value: Sequence[Any]): def scaled(self) -> bool: """Whether the feature set is scaled. - A data set is considered scaled when it has mean=0 and std=1, - or when there is a scaler in the pipeline. Binary columns (only - zeros and ones) are excluded from the calculation. + A data set is considered scaled when it has mean~0 and std~1, + or when there is a scaler in the pipeline. Categorical and + binary columns (only zeros and ones) are excluded from the + calculation. """ - return check_scaling(self.X, pipeline=self.pipeline) + return self.branch.check_scaling() @property - def duplicates(self) -> Int: + def duplicates(self) -> int: """Number of duplicate rows in the dataset.""" - return self.branch.dataset.duplicated().sum() + return int(self.branch.dataset.duplicated().sum()) @property - def nans(self) -> Series: + def nans(self) -> pd.Series: """Columns with the number of missing values in them. This property is unavailable for [sparse datasets][]. """ - if not is_sparse(self.X): - return replace_missing(self.X, self.missing).isna().sum() + if not is_sparse(self.branch.X): + return replace_missing(self.branch.X, self.missing).isna().sum() raise AttributeError("This property is unavailable for sparse datasets.") @@ -346,16 +343,16 @@ def n_nans(self) -> int: This property is unavailable for [sparse datasets][]. 
""" - if not is_sparse(self.X): - nans = replace_missing(self.X, self.missing).isna().sum(axis=1) + if not is_sparse(self.branch.X): + nans = replace_missing(self.branch.X, self.missing).isna().sum(axis=1) return len(nans[nans > 0]) raise AttributeError("This property is unavailable for sparse datasets.") @property - def numerical(self) -> Index: + def numerical(self) -> list[str]: """Names of the numerical features in the dataset.""" - return self.X.select_dtypes(include=["number"]).columns + return list(self.branch.X.select_dtypes(include=["number"]).columns) @property def n_numerical(self) -> int: @@ -363,9 +360,9 @@ def n_numerical(self) -> int: return len(self.numerical) @property - def categorical(self) -> Index: + def categorical(self) -> list[str]: """Names of the categorical features in the dataset.""" - return self.X.select_dtypes(include=CAT_TYPES).columns + return list(self.branch.X.select_dtypes(include=CAT_TYPES).columns) @property def n_categorical(self) -> int: @@ -379,7 +376,7 @@ def outliers(self) -> pd.Series: This property is unavailable for [sparse datasets][]. """ - if not is_sparse(self.X): + if not is_sparse(self.branch.X): data = self.branch.train.select_dtypes(include=["number"]) z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3 z_scores = pd.Series(z_scores.sum(axis=0), index=data.columns) @@ -388,16 +385,16 @@ def outliers(self) -> pd.Series: raise AttributeError("This property is unavailable for sparse datasets.") @property - def n_outliers(self) -> Int: + def n_outliers(self) -> int: """Number of samples in the training set containing outliers. This property is unavailable for [sparse datasets][]. """ - if not is_sparse(self.X): + if not is_sparse(self.branch.X): data = self.branch.train.select_dtypes(include=["number"]) z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3 - return z_scores.any(axis=1).sum() + return int(z_scores.any(axis=1).sum()) raise AttributeError("This property is unavailable for sparse datasets.") @@ -429,14 +426,14 @@ def classes(self) -> pd.DataFrame: raise AttributeError("This property is unavailable for regression tasks.") @property - def n_classes(self) -> Int | Series: + def n_classes(self) -> Int | pd.Series: """Number of classes in the target column(s). This property is only available for classification tasks. """ if self.task.is_classification: - return self.y.nunique(dropna=False) + return self.branch.y.nunique(dropna=False) raise AttributeError("This property is unavailable for regression tasks.") @@ -482,6 +479,9 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame: - **p_value:** Corresponding p-value. 
""" + from statsmodels.stats.diagnostic import acorr_ljungbox + from statsmodels.tsa.stattools import adfuller, kpss + columns_c = self.branch._get_columns(columns, only_numerical=True) df = pd.DataFrame( @@ -500,7 +500,8 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame: if test == "adf": stat = adfuller(X, maxlag=None, autolag="AIC") elif test == "kpss": - stat = kpss(X, regression="ct", nlags="auto") # ct is trend stationarity + # regression='ct' is trend stationarity + stat = kpss(X, regression="ct", nlags="auto") elif test == "lb": l_jung = acorr_ljungbox(X, lags=None, period=lst(self.sp.sp)[0]) stat = l_jung.loc[l_jung["lb_pvalue"].idxmin()] @@ -671,7 +672,7 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -682,20 +683,18 @@ def inverse_transform( Parameters ---------- - X: dataframe-like or None, default=None - Transformed feature set with shape=(n_samples, n_features). - If None, X is ignored in the transformers. + X: Transformed feature set with shape=(n_samples, n_features). + If None, `X` is ignored in the transformers. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Transformed target column corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -710,10 +709,10 @@ def inverse_transform( Original target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) + Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.inverse_transform(X, y) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.inverse_transform(Xt, yt) @classmethod def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM: @@ -749,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM **X, train, test: dataframe-like**
            Feature set with shape=(n_samples, n_features).

-        **y: int, str or sequence**<br>
-            Target column corresponding to `X`.
+        **y: int, str, sequence or dataframe**<br>
+ Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -815,7 +813,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM X_test, y_test = branch.pipeline.transform(branch.X_test, branch.y_test) # Update complete dataset - branch._container.data = bk.concat( + branch._container.data = pd.concat( [merge(X_train, y_train), merge(X_test, y_test)] ) @@ -824,7 +822,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM data=(dataset := branch._container.data.reset_index(drop=True)), train_idx=dataset.index[:len(branch._container.train_idx)], test_idx=dataset.index[-len(branch._container.test_idx):], - n_cols=branch._container.n_cols, + n_targets=branch._container.n_targets, ) # Store inactive branches in memory @@ -929,7 +927,7 @@ def shrink( """ - def get_data(new_t: DtypeObj) -> Series: + def get_data(new_t: DtypeObj) -> pd.Series: """Get the series with the right data format. Also converts to sparse format if `dense2sparse=True`. @@ -941,7 +939,7 @@ def get_data(new_t: DtypeObj) -> Series: Returns ------- - series + pd.Series Object with the new data type. """ @@ -975,9 +973,6 @@ def get_data(new_t: DtypeObj) -> Series: data = self.branch.dataset[self.branch._get_columns(columns)] - # Convert back since convert_dtypes doesn't work properly for pyarrow dtypes - data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()}) - # Convert to the best nullable dtype data = data.convert_dtypes() @@ -1012,11 +1007,6 @@ def get_data(new_t: DtypeObj) -> Series: get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max() ) - if self.engine.data == "pyarrow": - self.branch.dataset = self.dataset.astype( - {name: to_pyarrow(col) for name, col in self.dataset.items()} - ) - self._log("The column dtypes are successfully converted.", 1) @composed(crash, method_to_log) @@ -1030,26 +1020,26 @@ def stats(self, _vb: Int = -2, /): """ self._log("Dataset stats " + "=" * 20 + " >>", _vb) - self._log(f"Shape: {self.shape}", _vb) + self._log(f"Shape: {self.branch.shape}", _vb) if self.task.is_forecast and self.sp.sp: self._log(f"Seasonal period: {self.sp.sp}", _vb) for ds in ("train", "test", "holdout"): - if (data := getattr(self, ds)) is not None: + if (data := getattr(self.branch, ds)) is not None: self._log(f"{ds.capitalize()} set size: {len(data)}", _vb) if self.task.is_forecast: self._log(f" --> From: {min(data.index)} To: {max(data.index)}", _vb) self._log("-" * 37, _vb) - if (memory := self.dataset.memory_usage().sum()) < 1e6: + if (memory := self.branch.dataset.memory_usage().sum()) < 1e6: self._log(f"Memory: {memory / 1e3:.2f} kB", _vb) else: self._log(f"Memory: {memory / 1e6:.2f} MB", _vb) - if is_sparse(self.X): + if is_sparse(self.branch.X): self._log("Sparse: True", _vb) - if hasattr(self.X, "sparse"): # All columns are sparse - self._log(f"Density: {100. * self.X.sparse.density:.2f}%", _vb) + if hasattr(self.branch.X, "sparse"): # All columns are sparse + self._log(f"Density: {100. 
* self.branch.X.sparse.density:.2f}%", _vb) else: # Not all columns are sparse n_sparse = sum(isinstance(self[c].dtype, pd.SparseDtype) for c in self.features) n_dense = self.n_features - n_sparse @@ -1062,7 +1052,7 @@ def stats(self, _vb: Int = -2, /): n_categorical = self.n_categorical outliers = self.outliers.sum() try: # Can fail for unhashable columns (e.g., multilabel with lists) - duplicates = self.dataset.duplicated().sum() + duplicates = self.branch.dataset.duplicated().sum() except TypeError: duplicates = None self._log( @@ -1071,7 +1061,7 @@ def stats(self, _vb: Int = -2, /): 3, ) - if not self.X.empty: + if not self.branch.X.empty: self._log(f"Scaled: {self.scaled}", _vb) if nans: p_nans = round(100 * nans / self.branch.dataset.size, 1) @@ -1103,31 +1093,29 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Transform new data through the pipeline. Transformers that are only applied on the training set are skipped. If only `X` or only `y` is provided, it ignores transformers that require the other parameter. This can be - of use to, for example, transform only the target column. + of use to, for example, transform only the target column. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. If None, - X is ignored in the transformers. + `X` is ignored. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -1142,10 +1130,10 @@ def transform( Transformed target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target) + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.transform(X, y) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.transform(Xt, yt) # Base transformers ============================================ >> @@ -1153,11 +1141,15 @@ def _prepare_kwargs( self, kwargs: dict[str, Any], params: MappingProxyType | None = None, + *, + is_runner: Bool = False, ) -> dict[str, Any]: """Return kwargs with atom's values if not specified. This method is used for all transformers and runners to pass - atom's BaseTransformer's properties to the classes. + atom's BaseTransformer's properties to the classes. The engine + parameter is the only one that is modified for non-runners + since ATOM's transformers only accept the estimator engine. Parameters ---------- @@ -1167,6 +1159,9 @@ def _prepare_kwargs( params: mappingproxy or None, default=None Parameters in the class' signature. 
+ is_runner: bool, default=False + Whether the params are passed to a runner. + Returns ------- dict @@ -1175,7 +1170,12 @@ def _prepare_kwargs( """ for attr in BaseTransformer.attrs: if (not params or attr in params) and attr not in kwargs: - kwargs[attr] = getattr(self, attr) + if attr == "engine" and not is_runner: + # Engine parameter is special since we don't + # want to change data engines in the pipeline + kwargs[attr] = getattr(self, attr).estimator + else: + kwargs[attr] = getattr(self, attr) return kwargs @@ -1276,8 +1276,8 @@ def _add_transformer( fit = self._memory.cache(fit_one) kwargs = { "estimator": transformer_c, - "X": self.X_train, - "y": self.y_train, + "X": self.branch.X_train, + "y": self.branch.y_train, **fit_params, } @@ -1296,35 +1296,45 @@ def _add_transformer( self._branches.add("og") if transformer_c._train_only: - X, y = self.pipeline._mem_transform(transformer_c, self.X_train, self.y_train) - self.train = merge( - self.X_train if X is None else X, - self.y_train if y is None else y, + X, y = self.pipeline._mem_transform( + transformer=transformer_c, + X=self.branch.X_train, + y=self.branch.y_train, + ) + + self.branch.train = merge( + self.branch.X_train if X is None else X, + self.branch.y_train if y is None else y, ) + else: - X, y = self.pipeline._mem_transform(transformer_c, self.X, self.y) - data = merge(self.X if X is None else X, self.y if y is None else y) + X, y = self.pipeline._mem_transform(transformer_c, self.branch.X, self.branch.y) + data = merge(self.branch.X if X is None else X, self.branch.y if y is None else y) # y can change the number of columns or remove rows -> reassign index - self.branch._container = DataContainer( - data=data, - train_idx=self.branch._data.train_idx.intersection(data.index), - test_idx=self.branch._data.test_idx.intersection(data.index), - n_cols=self.branch._data.n_cols if y is None else len(get_cols(y)), + self._branches.fill( + DataContainer( + data=data, + train_idx=self.branch._data.train_idx.intersection(data.index), + test_idx=self.branch._data.test_idx.intersection(data.index), + n_targets=self.branch._data.n_targets if y is None else n_cols(y), + ) ) if self._config.index is False: - self.branch._container = DataContainer( - data=(data := self.dataset.reset_index(drop=True)), - train_idx=data.index[: len(self.branch._data.train_idx)], - test_idx=data.index[-len(self.branch._data.test_idx):], - n_cols=self.branch._data.n_cols, + self._branches.fill( + DataContainer( + data=(data := self.branch.dataset.reset_index(drop=True)), + train_idx=data.index[: len(self.branch._data.train_idx)], + test_idx=data.index[-len(self.branch._data.test_idx):], + n_targets=self.branch._data.n_targets, + ) ) if self.branch._holdout is not None: - self.branch._holdout.index = range( - len(data), len(data) + len(self.branch._holdout) + self.branch._holdout.index = pd.Index( + range(len(data), len(data) + len(self.branch._holdout)) ) - elif self.dataset.index.duplicated().any(): + elif self.branch.dataset.index.duplicated().any(): raise ValueError( "Duplicate indices found in the dataset. " "Try initializing atom using `index=False`." 
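The `_add_transformer` changes above are the machinery behind `atom.add`, and together with the `adjust(..., transform=self.engine.data)` change in `transform`, the user-facing calls look roughly like this (estimator choice and variable names are illustrative only):

```python
from sklearn.preprocessing import StandardScaler

# add() fits the transformer on the training set and writes the result back
# to the branch through _add_transformer (see the hunk above).
atom.add(StandardScaler(), columns=atom.numerical)

# transform() now converts its output to the selected data engine.
X_new_t = atom.transform(X_new, verbose=0)
```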
@@ -1452,8 +1462,8 @@ def add( @composed(crash, method_to_log) def apply( self, - func: Callable[..., DataFrame], - inverse_func: Callable[..., DataFrame] | None = None, + func: Callable[..., pd.DataFrame], + inverse_func: Callable[..., pd.DataFrame] | None = None, *, feature_names_out: FeatureNamesOut = None, kw_args: dict[str, Any] | None = None, @@ -1477,8 +1487,8 @@ def apply( Parameters ---------- func: callable - Function to apply with signature `func(dataset, **kw_args) -> - dataset`. + Function to apply with signature `func(dataframe, **kw_args) + -> dataframe-like`. inverse_func: callable or None, default=None Inverse function of `func`. If None, the inverse_transform @@ -1729,8 +1739,8 @@ def encode( @composed(crash, method_to_log) def impute( self, - strat_num: Scalar | NumericalStrats = "drop", - strat_cat: str | CategoricalStrats = "drop", + strat_num: Scalar | NumericalStrats = "mean", + strat_cat: str | CategoricalStrats = "most_frequent", *, max_nan_rows: FloatLargerZero | None = None, max_nan_cols: FloatLargerZero | None = None, @@ -2215,7 +2225,7 @@ def _run(self, trainer: BaseRunner): Instance that does the actual model training. """ - if any(col.dtype.kind not in "ifu" for col in get_cols(self.y)): + if any(col.dtype.kind not in "ifu" for col in get_cols(self.branch.y)): raise ValueError( "The target column is not numerical. Use atom.clean() " "to encode the target column to numerical values." @@ -2289,7 +2299,7 @@ def run( n_bootstrap=n_bootstrap, parallel=parallel, errors=errors, - **self._prepare_kwargs(kwargs), + **self._prepare_kwargs(kwargs, is_runner=True), ) ) @@ -2351,7 +2361,7 @@ class for a description of the parameters. n_bootstrap=n_bootstrap, parallel=parallel, errors=errors, - **self._prepare_kwargs(kwargs), + **self._prepare_kwargs(kwargs, is_runner=True), ) ) @@ -2411,6 +2421,6 @@ class for a description of the parameters. 
n_bootstrap=n_bootstrap, parallel=parallel, errors=errors, - **self._prepare_kwargs(kwargs), + **self._prepare_kwargs(kwargs, is_runner=True), ) ) diff --git a/atom/basemodel.py b/atom/basemodel.py index 584574595..170940b93 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -15,7 +15,7 @@ from importlib import import_module from logging import Logger from pathlib import Path -from typing import Any, Literal, overload +from typing import TYPE_CHECKING, Any, Literal, cast, overload from unittest.mock import patch import dill as pickle @@ -23,7 +23,6 @@ import numpy as np import optuna import pandas as pd -import ray from beartype import beartype from joblib.memory import Memory from joblib.parallel import Parallel, delayed @@ -37,7 +36,6 @@ from optuna.study import Study from optuna.terminator import report_cross_validation_scores from optuna.trial import FrozenTrial, Trial, TrialState -from ray import serve from sklearn.base import clone from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import roc_curve @@ -57,30 +55,33 @@ from sktime.performance_metrics.forecasting import make_forecasting_scorer from sktime.proba.normal import Normal from sktime.split import ExpandingWindowSplitter, SingleWindowSplitter -from starlette.requests import Request -from atom.branch import Branch, BranchManager +from atom.data import Branch, BranchManager from atom.data_cleaning import Scaler from atom.pipeline import Pipeline from atom.plots import RunnerPlot from atom.utils.constants import DF_ATTRS from atom.utils.patches import fit_and_score from atom.utils.types import ( - HT, Backend, Bool, DataFrame, Engine, FHConstructor, Float, - FloatZeroToOneExc, Index, Int, IntLargerEqualZero, MetricConstructor, - MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS, - Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, TargetSelector, - Verbose, Warnings, XSelector, YSelector, dataframe_t, float_t, int_t, + HT, Backend, Bool, Engine, FHConstructor, Float, FloatZeroToOneExc, Int, + IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas, + PredictionMethods, PredictionMethodsTS, Predictor, RowSelector, Scalar, + Scorer, Sequence, Stages, TargetSelector, Verbose, Warnings, XReturn, + XSelector, YReturn, YSelector, float_t, int_t, ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, - TrialsCallback, adjust_verbosity, bk, cache, check_dependency, check_empty, - check_scaling, composed, crash, estimator_has_attr, flt, get_cols, - get_custom_scorer, has_task, it, lst, merge, method_to_log, rnd, sign, - time_to_str, to_pandas, + TrialsCallback, adjust, cache, check_dependency, check_empty, composed, + crash, estimator_has_attr, flt, get_col_names, get_cols, get_custom_scorer, + has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df, + to_series, to_tabular, ) +if TYPE_CHECKING: + from starlette.requests import Request + + # Disable optuna info logs (ATOM already displays the same info) optuna.logging.set_verbosity(optuna.logging.WARNING) @@ -129,9 +130,16 @@ class BaseModel(RunnerPlot): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -148,6 +156,7 @@ class BaseModel(RunnerPlot): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. 
+ - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -264,9 +273,9 @@ def __init__( self._branch = branches.current self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts - if hasattr(self, "needs_scaling"): - if self.needs_scaling and not check_scaling(self.X, pipeline=self.pipeline): - self.scaler = Scaler().fit(self.X_train) + if getattr(self, "needs_scaling", None) and not self.branch.check_scaling(): + self.scaler = Scaler(device=self.device, engine=self.engine.estimator) + self.scaler.fit(self.X_train) def __repr__(self) -> str: """Display class name.""" @@ -274,17 +283,25 @@ def __repr__(self) -> str: def __dir__(self) -> list[str]: """Add additional attrs from __getattr__ to the dir.""" - attrs = list(super().__dir__()) + # Exclude from _available_if conditions + attrs = [x for x in super().__dir__() if hasattr(self, x)] + if "_branch" in self.__dict__: - attrs += [x for x in dir(self.branch) if not x.startswith("_")] - attrs += list(DF_ATTRS) + # Add additional attrs from the branch + attrs += self.branch._get_shared_attrs() + + # Add additional attrs from the dataset + attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)] + + # Add column names (excluding those with spaces) attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)] + return attrs def __getattr__(self, item: str) -> Any: """Get attributes from branch or data.""" if "_branch" in self.__dict__: - if item in dir(self.branch) and not item.startswith("_"): + if item in self.branch._get_shared_attrs(): return getattr(self.branch, item) # Get attr from branch elif item in self.branch.columns: return self.branch.dataset[item] # Get column @@ -485,8 +502,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ) -> Predictor: """Fit the estimator and perform in-training validation. @@ -688,7 +705,7 @@ def _get_pred( # Statsmodels models such as SARIMAX and DF require all # exogenous data after the last row of the train set # Other models accept this format - Xe = bk.concat([self.test, self.holdout]) # type: ignore[list-item] + Xe = pd.concat([self.test, self.holdout]) exog = Xe.loc[Xe.index <= X.index.max(), self.features] # type: ignore[index] y_pred = self._prediction( @@ -704,7 +721,7 @@ def _get_pred( f"Failed to get predictions for model {self.name} " f"on rows {rows}. Returning NaN. Exception: {ex}.", 3 ) - y_pred = bk.Series([np.NaN] * len(X), index=X.index) + y_pred = pd.Series([np.NaN] * len(X), index=X.index) else: y_pred = self._prediction(X.index, verbose=0, method=method_caller) @@ -722,7 +739,7 @@ def _score_from_est( self, scorer: Scorer, estimator: Predictor, - X: DataFrame, + X: pd.DataFrame, y: Pandas, **kwargs, ) -> Float: @@ -736,11 +753,11 @@ def _score_from_est( estimator: Predictor Estimator instance to get the score from. - X: dataframe + X: pd.DataFrame Feature set. - y: series or dataframe - Target column corresponding to `X`. + y: pd.Series or pd.DataFrame + Target column(s) corresponding to `X`. **kwargs Additional keyword arguments for the `scorer`. 
@@ -754,11 +771,10 @@ def _score_from_est( if self.task.is_forecast: y_pred = estimator.predict(fh=y.index, X=check_empty(X)) else: - y_pred = to_pandas( + y_pred = to_tabular( data=estimator.predict(X), index=y.index, - columns=getattr(y, "columns", None), - name=getattr(y, "name", None), + columns=get_col_names(y), ) return self._score_from_pred(scorer, y, y_pred, **kwargs) @@ -854,7 +870,7 @@ def _get_score( and hasattr(self.estimator, "predict_proba") ): y_true, y_pred = self._get_pred(rows, method="predict_proba") - if isinstance(y_pred, dataframe_t): + if isinstance(y_pred, pd.DataFrame): # Update every target column with its corresponding threshold for i, value in enumerate(threshold): y_pred.iloc[:, i] = (y_pred.iloc[:, i] > value).astype("int") @@ -1025,7 +1041,7 @@ def fit_model( args.append(cols) # Parallel loop over fit_model - results = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + results = Parallel(n_jobs=self.n_jobs)( delayed(fit_model)(estimator, i, j) for i, j in splitter.split(*args) ) @@ -1150,7 +1166,7 @@ def fit_model( self._log(f"Time elapsed: {time_to_str(self.trials.iat[-1, -2])}", 1) @composed(crash, method_to_log, beartype) - def fit(self, X: DataFrame | None = None, y: Pandas | None = None): + def fit(self, X: pd.DataFrame | None = None, y: Pandas | None = None): """Fit and validate the model. The estimator is fitted using the best hyperparameters found @@ -1160,12 +1176,12 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None): Parameters ---------- - X: dataframe or None + X: pd.DataFrame or None Feature set with shape=(n_samples, n_features). If None, `self.X_train` is used. - y: series, dataframe or None - Target column corresponding to `X`. If None, `self.y_train` + y: pd.Series, pd.DataFrame or None + Target column(s) corresponding to `X`. If None, `self.y_train` is used. 
""" @@ -1233,28 +1249,25 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None): sk_model=self.estimator, artifact_path=self._est_class.__name__, signature=infer_signature( - model_input=pd.DataFrame(self.X), + model_input=self.X, model_output=self.estimator.predict(self.X_test.iloc[[0]]), ), - input_example=pd.DataFrame(self.X.iloc[[0]]), + input_example=self.X.iloc[[0]], ) if self.log_data: for ds in ("train", "test"): - mlflow.log_input( - dataset=from_pandas(pd.DataFrame(getattr(self, ds))), - context=ds, - ) + mlflow.log_input(dataset=from_pandas(getattr(self, ds)), context=ds) if self.log_pipeline: mlflow.sklearn.log_model( sk_model=self.export_pipeline(), artifact_path=f"{self._est_class.__name__}_pipeline", signature=infer_signature( - model_input=pd.DataFrame(self.X), + model_input=self.X, model_output=self.estimator.predict(self.X_test.iloc[[0]]), ), - input_example=pd.DataFrame(self.X.iloc[[0]]), + input_example=self.X.iloc[[0]], ) @composed(crash, method_to_log, beartype) @@ -1629,22 +1642,22 @@ def pipeline(self) -> Pipeline: return self.branch.pipeline @property - def dataset(self) -> DataFrame: + def dataset(self) -> pd.DataFrame: """Complete data set.""" return merge(self.X, self.y) @property - def train(self) -> DataFrame: + def train(self) -> pd.DataFrame: """Training set.""" return merge(self.X_train, self.y_train) @property - def test(self) -> DataFrame: + def test(self) -> pd.DataFrame: """Test set.""" return merge(self.X_test, self.y_test) @property - def holdout(self) -> DataFrame | None: + def holdout(self) -> pd.DataFrame | None: """Holdout set.""" if (holdout := self.branch.holdout) is not None: if self.scaler: @@ -1655,23 +1668,24 @@ def holdout(self) -> DataFrame | None: return None @property - def X(self) -> DataFrame: + def X(self) -> pd.DataFrame: """Feature set.""" - return bk.concat([self.X_train, self.X_test]) + return pd.concat([self.X_train, self.X_test]) @property def y(self) -> Pandas: - """Target column.""" - return bk.concat([self.y_train, self.y_test]) + """Target column(s).""" + return pd.concat([self.y_train, self.y_test]) @property - def X_train(self) -> DataFrame: + def X_train(self) -> pd.DataFrame: """Features of the training set.""" features = self.branch.features.isin(self._config.ignore) + X_train = self.branch.X_train.iloc[-self._train_idx:, ~features] if self.scaler: - return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features]) + return cast(pd.DataFrame, self.scaler.transform(X_train)) else: - return self.branch.X_train.iloc[-self._train_idx:, ~features] + return X_train @property def y_train(self) -> Pandas: @@ -1679,16 +1693,17 @@ def y_train(self) -> Pandas: return self.branch.y_train[-self._train_idx:] @property - def X_test(self) -> DataFrame: + def X_test(self) -> pd.DataFrame: """Features of the test set.""" features = self.branch.features.isin(self._config.ignore) + X_test = self.branch.X_test.iloc[:, ~features] if self.scaler: - return self.scaler.transform(self.branch.X_test.iloc[:, ~features]) + return cast(pd.DataFrame, self.scaler.transform(X_test)) else: - return self.branch.X_test.iloc[:, ~features] + return X_test @property - def X_holdout(self) -> DataFrame | None: + def X_holdout(self) -> pd.DataFrame | None: """Features of the holdout set.""" if self.holdout is not None: return self.holdout[self.features] @@ -1709,34 +1724,34 @@ def shape(self) -> tuple[Int, Int]: return self.dataset.shape @property - def columns(self) -> Index: + def columns(self) -> list[str]: """Name of all 
the columns.""" - return self.dataset.columns + return list(self.dataset.columns) @property - def n_columns(self) -> Int: + def n_columns(self) -> int: """Number of columns.""" return len(self.columns) @property - def features(self) -> Index: + def features(self) -> list[str]: """Name of the features.""" - return self.columns[:-self.branch._data.n_cols] + return list(self.columns[:-self.branch._data.n_targets]) @property - def n_features(self) -> Int: + def n_features(self) -> int: """Number of features.""" return len(self.features) @property - def _all(self) -> DataFrame: + def _all(self) -> pd.DataFrame: """Dataset + holdout. Note that calling this property triggers the holdout set calculation. """ - return bk.concat([self.dataset, self.holdout]) + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> @@ -1837,8 +1852,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]: """ conv = lambda elem: elem.item() if hasattr(elem, "item") else elem - y_pred = self.inverse_transform(y=self.predict([X], verbose=0), verbose=0) - if isinstance(y_pred, dataframe_t): + if isinstance(y_pred := self.predict([X], verbose=0), pd.DataFrame): return [conv(elem) for elem in y_pred.iloc[0, :]] else: return conv(y_pred[0]) @@ -1859,7 +1873,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]: self.app = Interface( fn=inference, inputs=inputs, - outputs=["label"] * self.branch._data.n_cols, + outputs=["label"] * self.branch._data.n_targets, allow_flagging=kwargs.pop("allow_flagging", "never"), **{k: v for k, v in kwargs.items() if k in sign(Interface)}, ) @@ -2082,12 +2096,12 @@ def evaluate( """ if isinstance(threshold, float_t): - threshold_c = [threshold] * self.branch._data.n_cols # Length=n_targets - elif len(threshold) != self.branch._data.n_cols: + threshold_c = [threshold] * self.branch._data.n_targets # Length=n_targets + elif len(threshold) != self.branch._data.n_targets: raise ValueError( "Invalid value for the threshold parameter. The length of the list " f"list should be equal to the number of target columns, got len(target)" - f"={self.branch._data.n_cols} and len(threshold)={len(threshold)}." + f"={self.branch._data.n_targets} and len(threshold)={len(threshold)}." ) else: threshold_c = list(threshold) @@ -2184,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False): if include_holdout and self.holdout is None: raise ValueError("No holdout data set available.") - if include_holdout and self.holdout is not None: - X = bk.concat([self.X, self.X_holdout]) - y = bk.concat([self.y, self.y_holdout]) - else: + if not include_holdout: X, y = self.X, self.y + else: + X = pd.concat([self.X, self.X_holdout]) + y = pd.concat([self.y, self.y_holdout]) # Assign a mlflow run to the new estimator if self.experiment: @@ -2234,11 +2248,11 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are - skipped. The rest should all implement a `inverse_transform` + skipped. The rest should all implement an `inverse_transform` method. If only `X` or only `y` is provided, it ignores transformers that require the other parameter. 
This can be of use to, for example, inversely transform only the target @@ -2249,18 +2263,17 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). - If None, X is ignored in the transformers. + If None, `X` is ignored in the transformers. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -2275,10 +2288,10 @@ def inverse_transform( Original target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) + Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.inverse_transform(X, y) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.inverse_transform(Xt, yt) @composed(crash, method_to_log, beartype) def register( @@ -2378,8 +2391,11 @@ def serve(self, method: str = "predict", host: str = "127.0.0.1", port: Int = 80 Port for HTTP server. """ + check_dependency("ray") + import ray + from ray.serve import deployment, run - @serve.deployment + @deployment class ServeModel: """Model deployment class. @@ -2413,16 +2429,12 @@ async def __call__(self, request: Request) -> np.ndarray: """ payload = await request.json() - return getattr(self.pipeline, self.method)(bk.read_json(payload)) + return getattr(self.pipeline, self.method)(pd.read_json(payload)) if not ray.is_initialized(): ray.init(log_to_driver=False) - server = ServeModel.bind( - pipeline=self.export_pipeline(), - method=method, - ) - serve.run(server, host=host, port=port) + run(ServeModel.bind(pipeline=self.export_pipeline(), method=method), host=host, port=port) self._log(f"Serving model {self.fullname} on {host}:{port}...", 1) @@ -2433,7 +2445,7 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Transform new data through the pipeline. Transformers that are only applied on the training set are @@ -2447,19 +2459,18 @@ def transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. If None, - X is ignored in the transformers. + `X` is ignored. If None, + `X` is ignored in the transformers. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. 
+ - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -2474,10 +2485,10 @@ def transform( Transformed target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target) + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.transform(X, y) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.transform(Xt, yt) class ClassRegModel: @@ -2517,20 +2528,25 @@ def _prediction( y: YSelector | None = ..., metric: str | MetricFunction | Scorer | None = ..., sample_weight: Sequence[Scalar] | None = ..., - verbose: Int | None = ..., - method: Literal["score"] = ..., - ) -> Float: ... + verbose: Verbose | None = ..., + method: Literal[ + "decision_function", + "predict", + "predict_log_proba", + "predict_proba", + ] = ..., + ) -> Pandas: ... @overload def _prediction( self, X: RowSelector | XSelector, - y: YSelector | None = ..., - metric: str | MetricFunction | Scorer | None = ..., - sample_weight: Sequence[Scalar] | None = ..., - verbose: Int | None = ..., - method: PredictionMethods = ..., - ) -> Pandas: ... + y: YSelector | None, + metric: str | MetricFunction | Scorer | None, + sample_weight: Sequence[Scalar] | None, + verbose: Verbose | None, + method: Literal["score"], + ) -> Float: ... def _prediction( self, @@ -2538,7 +2554,7 @@ def _prediction( y: YSelector | None = None, metric: str | MetricFunction | Scorer | None = None, sample_weight: Sequence[Scalar] | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, method: PredictionMethods = "predict", ) -> Float | Pandas: """Get predictions on new data or existing rows. @@ -2554,13 +2570,12 @@ def _prediction( set with shape=(n_samples, n_features) to make predictions on. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -2590,30 +2605,38 @@ def _prediction( """ - def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[DataFrame, Pandas]: + def get_transform_X_y( + X: RowSelector | XSelector, + y: YSelector | None, + ) -> tuple[pd.DataFrame, Pandas | None]: """Get X and y from the pipeline transformation. Parameters ---------- - X: dataframe-like - Feature set. + X: hashable, segment, sequence or dataframe-like + Feature set. If not dataframe-like, expected to fail. - y: int, str or sequence - Target column. + y: int, str, sequence, dataframe-like or None + Target column(s) corresponding to `X`. Returns ------- dataframe Transformed feature set. 
- series or dataframe + series, dataframe or None Transformed target column. """ - if isinstance(out := self.transform(X, y, verbose=verbose), tuple): + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) + + with adjust(self.pipeline, verbose=verbose) as pl: + out = pl.transform(Xt, yt) + + if isinstance(out, tuple): return out else: - return out, y + return out, yt def assign_prediction_columns() -> list[str]: """Assign column names for the prediction methods. @@ -2630,7 +2653,7 @@ def assign_prediction_columns() -> list[str]: return self.mapping.get(self.target, np.unique(self.y).astype(str)) try: - if isinstance(X, dataframe_t): + if isinstance(X, pd.DataFrame): # Dataframe must go first since we can expect # prediction calls from dataframes with reset indices Xt, yt = get_transform_X_y(X, y) @@ -2645,32 +2668,27 @@ def assign_prediction_columns() -> list[str]: if method != "score": pred = np.array(self.memory.cache(getattr(self.estimator, method))(Xt[self.features])) - if pred.ndim < 3: - data = to_pandas( - data=pred, - index=Xt.index, - name=self.target, - columns=assign_prediction_columns(), - ) + if pred.ndim == 1 or pred.shape[1] == 1: + return to_series(pred, index=Xt.index, name=self.target) + elif pred.ndim < 3: + return to_df(pred, index=Xt.index, columns=assign_prediction_columns()) elif self.task is Task.multilabel_classification: # Convert to (n_samples, n_targets) - data = bk.DataFrame( + return pd.DataFrame( data=np.array([d[:, 1] for d in pred]).T, index=Xt.index, columns=assign_prediction_columns(), ) else: # Convert to (n_samples * n_classes, n_targets) - data = bk.DataFrame( + return pd.DataFrame( data=pred.reshape(-1, pred.shape[2]), - index=bk.MultiIndex.from_tuples( + index=pd.MultiIndex.from_tuples( [(col, idx) for col in np.unique(self.y) for idx in Xt.index] ), columns=assign_prediction_columns(), ) - return data - else: if metric is None: scorer = self._metric[0] @@ -2691,8 +2709,8 @@ def decision_function( self, X: RowSelector | XSelector, *, - verbose: Int | None = None, - ) -> Pandas: + verbose: Verbose | None = None, + ) -> YReturn: """Get confidence scores on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2721,7 +2739,7 @@ def decision_function( multiclass classification tasks. """ - return self._prediction(X, verbose=verbose, method="decision_function") + return self._convert(self._prediction(X, verbose=verbose, method="decision_function")) @available_if(estimator_has_attr("predict")) @composed(crash, method_to_log, beartype) @@ -2730,8 +2748,8 @@ def predict( X: RowSelector | XSelector, *, inverse: Bool = True, - verbose: Int | None = None, - ) -> Pandas: + verbose: Verbose | None = None, + ) -> YReturn: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2769,7 +2787,7 @@ def predict( if inverse: return self.inverse_transform(y=pred) else: - return pred + return self._convert(pred) @available_if(estimator_has_attr("predict_log_proba")) @composed(crash, method_to_log, beartype) @@ -2777,8 +2795,8 @@ def predict_log_proba( self, X: RowSelector | XSelector, *, - verbose: Int | None = None, - ) -> DataFrame: + verbose: Verbose | None = None, + ) -> XReturn: """Get class log-probabilities on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2806,7 +2824,7 @@ def predict_log_proba( a multiindex format for [multioutput tasks][]. 
""" - return self._prediction(X, verbose=verbose, method="predict_log_proba") + return self._convert(self._prediction(X, verbose=verbose, method="predict_log_proba")) @available_if(estimator_has_attr("predict_proba")) @composed(crash, method_to_log, beartype) @@ -2814,8 +2832,8 @@ def predict_proba( self, X: RowSelector | XSelector, *, - verbose: Int | None = None, - ) -> DataFrame: + verbose: Verbose | None = None, + ) -> XReturn: """Get class probabilities on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2843,7 +2861,7 @@ def predict_proba( a multiindex format for [multioutput tasks][]. """ - return self._prediction(X, verbose=verbose, method="predict_proba") + return self._convert(self._prediction(X, verbose=verbose, method="predict_proba")) @available_if(estimator_has_attr("score")) @composed(crash, method_to_log, beartype) @@ -2854,7 +2872,7 @@ def score( *, metric: str | MetricFunction | Scorer | None = None, sample_weight: Sequence[Scalar] | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> Float: """Get a metric score on new data. @@ -2876,13 +2894,12 @@ def score( set with shape=(n_samples, n_features) to make predictions on. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. - If None: `X` must be a selection of rows in the dataset. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -2947,26 +2964,44 @@ def get_tags(self) -> dict[str, Any]: @overload def _prediction( self, - fh: RowSelector | FHConstructor | None = None, - y: RowSelector | YSelector | None = None, - X: XSelector | None = None, - metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, - method: Literal["score"] = ..., + fh: RowSelector | FHConstructor | None = ..., + y: RowSelector | YSelector | None = ..., + X: XSelector | None = ..., + metric: str | MetricFunction | Scorer | None = ..., + verbose: Verbose | None = ..., + method: Literal[ + "predict", + "predict_interval", + "predict_quantiles", + "predict_residuals", + "predict_var", + ] = ..., **kwargs, - ) -> Float: ... + ) -> Pandas: ... @overload def _prediction( self, - fh: RowSelector | FHConstructor | None = None, - y: RowSelector | YSelector | None = None, - X: XSelector | None = None, - metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, - method: PredictionMethodsTS = ..., + fh: RowSelector | FHConstructor | None, + y: RowSelector | YSelector | None, + X: XSelector | None, + metric: str | MetricFunction | Scorer | None, + verbose: Verbose | None, + method: Literal["predict_proba"], **kwargs, - ) -> Pandas: ... + ) -> Normal: ... + + @overload + def _prediction( + self, + fh: RowSelector | FHConstructor | None, + y: RowSelector | YSelector | None, + X: XSelector | None, + metric: str | MetricFunction | Scorer | None, + verbose: Verbose | None, + method: Literal["score"], + **kwargs, + ) -> Float: ... 
def _prediction( self, @@ -2974,10 +3009,10 @@ def _prediction( y: RowSelector | YSelector | None = None, X: XSelector | None = None, metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, method: PredictionMethodsTS = "predict", **kwargs, - ) -> Float | Pandas: + ) -> Float | Normal | Pandas: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2990,7 +3025,7 @@ def _prediction( The [forecasting horizon][row-and-column-selection] encoding the time stamps to forecast at. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None @@ -3014,18 +3049,23 @@ def _prediction( Returns ------- - float, series or dataframe + float, sktime.proba.[Normal][], series or dataframe Calculated predictions. The return type depends on the method called. """ if y is not None or X is not None: - if isinstance(out := self.transform(X, y, verbose=verbose), tuple): + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) + + with adjust(self.pipeline, verbose=verbose) as pl: + out = pl.transform(Xt, yt) + + if isinstance(out, tuple): Xt, yt = out elif X is not None: - Xt, yt = out, y + Xt, yt = out, yt else: - Xt, yt = X, out + Xt, yt = Xt, out else: Xt, yt = X, y @@ -3051,8 +3091,9 @@ def predict( fh: RowSelector | FHConstructor, X: XSelector | None = None, *, - verbose: Int | None = None, - ) -> Pandas: + inverse: Bool = True, + verbose: Verbose | None = None, + ) -> YReturn: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3070,6 +3111,12 @@ def predict( X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to `fh`. + inverse: bool, default=True + Whether to inversely transform the output through the + pipeline. This doesn't affect the predictions if there are + no transformers in the pipeline or if the transformers have + no `inverse_transform` method or don't apply to `y`. + verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. @@ -3081,7 +3128,12 @@ def predict( n_targets) for [multivariate][] tasks. """ - return self._prediction(fh=fh, X=X, verbose=verbose, method="predict") + pred = self._prediction(fh=fh, X=X, verbose=verbose, method="predict") + + if inverse: + return self.inverse_transform(y=pred) + else: + return self._convert(pred) @available_if(estimator_has_attr("predict_interval")) @composed(crash, method_to_log, beartype) @@ -3091,8 +3143,8 @@ def predict_interval( X: XSelector | None = None, *, coverage: Float | Sequence[Float] = 0.9, - verbose: Int | None = None, - ) -> DataFrame: + verbose: Verbose | None = None, + ) -> XReturn: """Get prediction intervals on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3123,12 +3175,14 @@ def predict_interval( Computed interval forecasts. 
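To make the new `inverse` flag on the forecaster's `predict` concrete, here is a hedged sketch (the airline data and the "ES" model acronym are assumptions; any forecasting model works). With `inverse=True` (the default) the forecast is routed back through the pipeline's `inverse_transform`; with `inverse=False` it is only converted by the data engine.

```python
from sktime.datasets import load_airline
from atom import ATOMForecaster

y = load_airline()

atom = ATOMForecaster(y, verbose=0)
atom.run(models="ES")  # ExponentialSmoothing (assumed acronym)

fh = list(range(1, 13))  # forecast the next 12 periods

pred = atom.es.predict(fh=fh)                      # inversely transformed (default)
pred_raw = atom.es.predict(fh=fh, inverse=False)   # pipeline output as-is
ci = atom.es.predict_interval(fh=fh, coverage=0.9)
```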
""" - return self._prediction( - fh=fh, - X=X, - coverage=coverage, - verbose=verbose, - method="predict_interval", + return self._convert( + self._prediction( + fh=fh, + X=X, + coverage=coverage, + verbose=verbose, + method="predict_interval", + ) ) @available_if(estimator_has_attr("predict_proba")) @@ -3139,7 +3193,7 @@ def predict_proba( X: XSelector | None = None, *, marginal: Bool = True, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> Normal: """Get probabilistic forecasts on new data or existing rows. @@ -3187,8 +3241,8 @@ def predict_quantiles( X: XSelector | None = None, *, alpha: Float | Sequence[Float] = (0.05, 0.95), - verbose: Int | None = None, - ) -> DataFrame: + verbose: Verbose | None = None, + ) -> XReturn: """Get quantile forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3220,12 +3274,14 @@ def predict_quantiles( Computed quantile forecasts. """ - return self._prediction( - fh=fh, - X=X, - alpha=alpha, - verbose=verbose, - method="predict_quantiles", + return self._convert( + self._prediction( + fh=fh, + X=X, + alpha=alpha, + verbose=verbose, + method="predict_quantiles", + ) ) @available_if(estimator_has_attr("predict_residuals")) @@ -3235,8 +3291,8 @@ def predict_residuals( y: RowSelector | YSelector, X: XSelector | None = None, *, - verbose: Int | None = None, - ) -> Pandas: + verbose: Verbose | None = None, + ) -> YReturn: """Get residuals of forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3247,7 +3303,7 @@ def predict_residuals( Parameters ---------- - y: int, str, dict, sequence or dataframe + y: int, str, sequence or dataframe Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None @@ -3264,7 +3320,9 @@ def predict_residuals( n_targets) for [multivariate][] tasks. """ - return self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals") + return self._convert( + self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals") + ) @available_if(estimator_has_attr("predict_var")) @composed(crash, method_to_log, beartype) @@ -3274,8 +3332,8 @@ def predict_var( X: XSelector | None = None, *, cov: Bool = False, - verbose: Int | None = None, - ) -> DataFrame: + verbose: Verbose | None = None, + ) -> XReturn: """Get variance forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3307,12 +3365,14 @@ def predict_var( Computed variance forecasts. """ - return self._prediction( - fh=fh, - X=X, - cov=cov, - verbose=verbose, - method="predict_var", + return self._convert( + self._prediction( + fh=fh, + X=X, + cov=cov, + verbose=verbose, + method="predict_var", + ) ) @available_if(estimator_has_attr("score")) @@ -3324,7 +3384,7 @@ def score( fh: RowSelector | FHConstructor | None = None, *, metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> Float: """Get a metric score on new data. @@ -3341,7 +3401,7 @@ def score( Parameters ---------- - y: int, str, dict, sequence or dataframe + y: int, str, sequence or dataframe-like Ground truth observations. 
X: hashable, segment, sequence, dataframe-like or None, default=None diff --git a/atom/baserunner.py b/atom/baserunner.py index c85f95222..bc128d132 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -32,19 +32,18 @@ from atom.basetracker import BaseTracker from atom.basetransformer import BaseTransformer -from atom.branch import Branch +from atom.data import Branch from atom.models import MODELS, Stacking, Voting from atom.pipeline import Pipeline from atom.utils.constants import DF_ATTRS from atom.utils.types import ( - Bool, DataFrame, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int, + Bool, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int, IntLargerOne, MetricConstructor, Model, ModelSelector, ModelsSelector, - Pandas, RowSelector, Seasonality, Segment, Sequence, Series, SPDict, - SPTuple, TargetSelector, YSelector, bool_t, dataframe_t, int_t, segment_t, - sequence_t, + Pandas, RowSelector, Seasonality, Segment, Sequence, SPDict, SPTuple, + TargetSelector, YSelector, bool_t, int_t, pandas_t, segment_t, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataContainer, Goal, SeasonalPeriod, Task, bk, check_is_fitted, + ClassMap, DataContainer, Goal, SeasonalPeriod, Task, check_is_fitted, composed, crash, divide, flt, get_cols, get_segment, get_versions, has_task, lst, merge, method_to_log, n_cols, ) @@ -80,27 +79,42 @@ def __setstate__(self, state: dict[str, Any]): def __dir__(self) -> list[str]: """Add additional attrs from __getattr__ to the dir.""" - attrs = list(super().__dir__()) - attrs += [x for x in dir(self.branch) if not x.startswith("_")] - attrs += list(DF_ATTRS) + # Exclude from _available_if conditions + attrs = [x for x in super().__dir__() if hasattr(self, x)] + + # Add additional attrs from the branch + attrs += self.branch._get_shared_attrs() + + # Add additional attrs from the dataset + attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)] + + # Add branch names in lower-case attrs += [b.name.lower() for b in self._branches] + + # Add column names (excluding those with spaces) attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)] + + # Add model names in lower-case if isinstance(self._models, ClassMap): attrs += [m.name.lower() for m in self._models] + return attrs def __getattr__(self, item: str) -> Any: """Get branch, attr from branch, model, column or attr from dataset.""" if item in self.__dict__["_branches"]: return self._branches[item] # Get branch - elif item in dir(self.branch) and not item.startswith("_"): - return getattr(self.branch, item) # Get attr from branch + elif item in self.branch._get_shared_attrs(): + if isinstance(attr := getattr(self.branch, item), pandas_t): + return self._convert(attr) # Transform data through data engine + else: + return attr elif item in self.__dict__["_models"]: return self._models[item] # Get model elif item in self.branch.columns: return self.branch.dataset[item] # Get column from dataset - elif item in DF_ATTRS: - return getattr(self.branch.dataset, item) # Get attr from dataset + elif item in DF_ATTRS and hasattr(self.dataset, item): + return getattr(self.dataset, item) # Get attr from dataset else: raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'.") @@ -120,7 +134,7 @@ def __delattr__(self, item: str): def __len__(self) -> int: """Return length of dataset.""" - return len(self.dataset) + return len(self.branch.dataset) def __contains__(self, item: str) -> bool: """Whether the item is a column in the dataset.""" @@ -159,7 +173,7 @@ def 
__sklearn_is_fitted__(self) -> bool: @cached_property def task(self) -> Task: """Dataset's [task][] type.""" - return self._goal.infer_task(self.y) + return self._goal.infer_task(self.branch.y) @property def sp(self) -> SPTuple: @@ -202,14 +216,14 @@ def branch(self) -> Branch: return self._branches.current @property - def holdout(self) -> DataFrame | None: + def holdout(self) -> pd.DataFrame | None: """Holdout set. This data set is untransformed by the pipeline. Read more in the [user guide][data-sets]. """ - return self.branch._holdout + return self._convert(self.branch._holdout) @property def models(self) -> str | list[str] | None: @@ -378,11 +392,11 @@ def get_single_sp(sp: Int | str) -> int: def _get_data( self, - arrays: tuple, + arrays: tuple[Any, ...], y: YSelector = -1, *, - index: IndexSelector = False, - ) -> tuple[DataContainer, DataFrame | None]: + index: IndexSelector | None = None, + ) -> tuple[DataContainer, pd.DataFrame | None]: """Get data sets from a sequence of indexables. Also assigns an index, (stratified) shuffles and selects a @@ -396,20 +410,21 @@ def _get_data( y: int, str or sequence, default=-1 Transformed target column. - index: bool, int, str or sequence, default=False - Index parameter as provided in constructor. + index: bool, int, str, sequence or None, default=None + Index parameter as provided in constructor. If None, the + index is retrieved from `self._config`. Returns ------- DataContainer Train and test sets. - dataframe or None + pd.DataFrame or None Holdout data set. Returns None if not specified. """ - def _subsample(df: DataFrame) -> DataFrame: + def _subsample(df: pd.DataFrame) -> pd.DataFrame: """Select a random subset of a dataframe. If shuffle=True, the subset is shuffled, else row order @@ -418,12 +433,12 @@ def _subsample(df: DataFrame) -> DataFrame: Parameters ---------- - df: dataframe + df: pd.DataFrame Dataset. Returns ------- - dataframe + pd.DataFrame Subset of df. """ @@ -439,25 +454,36 @@ def _subsample(df: DataFrame) -> DataFrame: else: return df.iloc[sorted(random.sample(range(len(df)), k=n_rows))] - def _set_index(df: DataFrame, y: Pandas | None) -> DataFrame: + def _set_index( + df: pd.DataFrame, + y: Pandas | None, + index: IndexSelector | None = None, + ) -> pd.DataFrame: """Assign an index to the dataframe. Parameters ---------- - df: dataframe + df: pd.DataFrame Dataset. - y: series, dataframe or None + y: pd.Series, pd.DataFrame or None Target column(s). Used to check that the provided index is not one of the target columns. If None, the check is skipped. + index: bool, int, str or sequence or None, default=None + Index parameter as provided in constructor. If None, the + index is retrieved from `self._config`. + Returns ------- - dataframe + pd.DataFrame Dataset with updated indices. """ + if index is None: + index = self._config.index + if index is True: # True gets caught by isinstance(int) pass elif index is False: @@ -494,9 +520,9 @@ def _set_index(df: DataFrame, y: Pandas | None) -> DataFrame: return df def _no_data_sets( - X: DataFrame, + X: pd.DataFrame, y: Pandas, - ) -> tuple[DataContainer, DataFrame | None]: + ) -> tuple[DataContainer, pd.DataFrame | None]: """Generate data sets from one dataset. Additionally, assigns an index, shuffles the data, selects @@ -505,10 +531,10 @@ def _no_data_sets( Parameters ---------- - X: dataframe + X: pd.DataFrame Feature set with shape=(n_samples, n_features). - y: series or dataframe + y: pd.Series or pd.DataFrame Target column(s) corresponding to `X`. 
Returns @@ -516,7 +542,7 @@ def _no_data_sets( DataContainer Train and test sets. - dataframe or None + pd.DataFrame or None Holdout data set. Returns None if not specified. """ @@ -536,7 +562,7 @@ def _no_data_sets( "Invalid value for the index parameter. Length of index " f"({len(index)}) doesn't match that of the dataset ({len(data)})." ) - data.index = index + data.index = pd.Index(index) if len(data) < 5: raise ValueError( @@ -589,23 +615,22 @@ def _no_data_sets( stratify=self._config.get_stratify_columns(data, y), ) - complete_set = _set_index(bk.concat([train, test, holdout]), y) + complete_set = _set_index(pd.concat([train, test, holdout]), y, index) container = DataContainer( data=(data := complete_set.iloc[: len(data)]), train_idx=data.index[:-len(test)], test_idx=data.index[-len(test):], - n_cols=len(get_cols(y)), + n_targets=n_cols(y), ) except ValueError as ex: # Clarify common error with stratification for multioutput tasks - if "least populated class" in str(ex) and isinstance(y, dataframe_t): + if isinstance(y, pd.DataFrame): raise ValueError( "Stratification for multioutput tasks is applied over all target " - "columns, which results in a least populated class that has only " - "one member. Either select only one column to stratify over, or " - "set the parameter stratify=False." + "columns. Either select only one column to stratify over, or set " + "the parameter stratify=False." ) from ex else: raise ex @@ -616,13 +641,13 @@ def _no_data_sets( return container, holdout def _has_data_sets( - X_train: DataFrame, + X_train: pd.DataFrame, y_train: Pandas, - X_test: DataFrame, + X_test: pd.DataFrame, y_test: Pandas, - X_holdout: DataFrame | None = None, + X_holdout: pd.DataFrame | None = None, y_holdout: Pandas | None = None, - ) -> tuple[DataContainer, DataFrame | None]: + ) -> tuple[DataContainer, pd.DataFrame | None]: """Generate data sets from provided sets. Additionally, assigns an index, shuffles the data and @@ -630,22 +655,22 @@ def _has_data_sets( Parameters ---------- - X_train: dataframe + X_train: pd.DataFrame Training set. - y_train: series or dataframe + y_train: pd.Series or pd.DataFrame Target column(s) corresponding to `X`_train. - X_test: dataframe + X_test: pd.DataFrame Test set. - y_test: series or dataframe + y_test: pd.Series or pd.DataFrame Target column(s) corresponding to `X`_test. - X_holdout: dataframe or None - Holdout set. Is None if not provided by the user. + X_holdout: pd.DataFrame or None, default=None + Holdout set. Can be None if not provided by the user. - y_holdout: series, dataframe or None + y_holdout: pd.Series, pd.DataFrame or None, default=None Target column(s) corresponding to `X`_holdout. Returns @@ -653,7 +678,7 @@ def _has_data_sets( DataContainer Train and test sets. - dataframe or None + pd.DataFrame or None Holdout data set. Returns None if not specified. """ @@ -696,18 +721,18 @@ def _has_data_sets( "Invalid value for the index parameter. Length of index " f"({len(index)}) doesn't match that of the data sets ({len_data})." 
) - train.index = index[: len(train)] - test.index = index[len(train): len(train) + len(test)] + train.index = pd.Index(index[: len(train)]) + test.index = pd.Index(index[len(train): len(train) + len(test)]) if holdout is not None: - holdout.index = index[-len(holdout):] + holdout.index = pd.Index(index[-len(holdout):]) - complete_set = _set_index(bk.concat([train, test, holdout]), y_test) + complete_set = _set_index(pd.concat([train, test, holdout]), y_test, index) container = DataContainer( data=(data := complete_set.iloc[:len(train) + len(test)]), train_idx=data.index[: len(train)], test_idx=data.index[-len(test):], - n_cols=len(get_cols(y_train)), + n_targets=n_cols(y_train), ) if holdout is not None: @@ -718,16 +743,16 @@ def _has_data_sets( # Process input arrays ===================================== >> if len(arrays) == 0: - if self._goal.name == "forecast" and not isinstance(y, (*int_t, str)): + if self.branch._container: + return self.branch._data, self.branch._holdout + elif self._goal is Goal.forecast and not isinstance(y, (*int_t, str)): # arrays=() and y=y for forecasting sets = _no_data_sets(*self._check_input(y=y)) - elif not self.branch._container: + else: raise ValueError( "The data arrays are empty! Provide the data to run the pipeline " "successfully. See the documentation for the allowed formats." ) - else: - return self.branch._data, self.branch._holdout elif len(arrays) == 1: # X or y for forecasting @@ -787,7 +812,7 @@ def _has_data_sets( if self._goal.name == "forecast": # For forecasting, check if index complies with sktime's standard valid, msg, _ = check_is_mtype( - obj=pd.DataFrame(bk.concat([sets[0].data, sets[1]])), + obj=pd.DataFrame(pd.concat([sets[0].data, sets[1]])), mtype="pd.DataFrame", return_metadata=True, var_name="the dataset", @@ -797,7 +822,7 @@ def _has_data_sets( raise ValueError(msg) else: # Else check for duplicate indices - if bk.concat([sets[0].data, sets[1]]).index.duplicated().any(): + if pd.concat([sets[0].data, sets[1]]).index.duplicated().any(): raise ValueError( "Duplicate indices found in the dataset. " "Try initializing atom using `index=False`." @@ -1106,7 +1131,7 @@ def export_pipeline(self, model: str | Model | None = None) -> Pipeline: def get_class_weight( self, rows: RowSelector = "train", - ) -> dict[Hashable, float] | dict[str, dict[Hashable, float]]: + ) -> dict[Hashable, float] | dict[Hashable, dict[Hashable, float]]: """Return class weights for a balanced data set. Statistically, the class weights re-balance the data set so @@ -1128,12 +1153,12 @@ def get_class_weight( """ - def get_weights(col: Series) -> dict[Hashable, float]: + def get_weights(col: pd.Series) -> dict[Hashable, float]: """Get the class weights for one column. Parameters ---------- - col: series + col: pd.Series Column to get the weights from. Returns @@ -1147,14 +1172,14 @@ def get_weights(col: Series) -> dict[Hashable, float]: _, y = self.branch._get_rows(rows, return_X_y=True) - if self.task.is_multioutput: - return {str(col.name): get_weights(col) for col in get_cols(y)} - else: + if isinstance(y, pd.Series): return get_weights(y) + else: + return {col.name: get_weights(col) for col in get_cols(y)} @available_if(has_task("classification")) @composed(crash, beartype) - def get_sample_weight(self, rows: RowSelector = "train") -> Series: + def get_sample_weight(self, rows: RowSelector = "train") -> pd.Series: """Return sample weights for a balanced data set. 
The returned weights are inversely proportional to the class @@ -1169,13 +1194,13 @@ def get_sample_weight(self, rows: RowSelector = "train") -> Series: Returns ------- - series + pd.Series Sequence of weights with shape=(n_samples,). """ _, y = self.branch._get_rows(rows, return_X_y=True) weights = compute_sample_weight("balanced", y=y) - return bk.Series(weights, name="sample_weight").round(3) + return pd.Series(weights, name="sample_weight").round(3) @available_if(has_task("forecast")) @composed(crash, beartype) diff --git a/atom/basetrainer.py b/atom/basetrainer.py index b88819538..3a74b1c49 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -12,15 +12,13 @@ from datetime import datetime as dt from typing import Any -import joblib import mlflow import numpy as np -import ray from joblib import Parallel, delayed from optuna import Study, create_study from atom.baserunner import BaseRunner -from atom.branch import BranchManager +from atom.data import BranchManager from atom.data_cleaning import BaseTransformer from atom.models import MODELS, CustomModel from atom.plots import RunnerPlot @@ -70,7 +68,7 @@ def __init__( self._models = lst(models) if models is not None else ClassMap() self._metric = lst(metric) if metric is not None else ClassMap() - self._config = DataConfig() + self._config = DataConfig(index=self._goal is Goal.forecast) self._branches = BranchManager(memory=self.memory) self._n_trials = {} @@ -374,14 +372,20 @@ def execute_model(m: Model) -> Model | None: m.verbose = self.verbose if self.backend == "ray": + import ray + # This implementation is more efficient than through joblib's # ray backend. The difference is that in this one you start N # tasks, and in the other, you start N actors and then have them # each run the function execute_remote = ray.remote(num_cpus=self.n_jobs)(execute_model) models = ray.get([execute_remote.remote(m) for m in self._models]) + elif self.backend == "dask": + import dask + + models = dask.compute(*[dask.delayed(execute_model)(m) for m in self._models]) else: - models = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + models = Parallel(n_jobs=self.n_jobs)( delayed(execute_model)(m) for m in self._models ) @@ -391,8 +395,7 @@ def execute_model(m: Model) -> Model | None: m.verbose = vb else: - with joblib.parallel_backend(backend=self.backend): - models = [model for m in self._models if (model := execute_model(m))] + models = [model for m in self._models if (model := execute_model(m))] self._models = ClassMap(m for m in models if m) diff --git a/atom/basetransformer.py b/atom/basetransformer.py index d5697754f..859d3f930 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -13,7 +13,6 @@ import tempfile import warnings from collections.abc import Hashable -from copy import deepcopy from datetime import datetime as dt from importlib import import_module from importlib.util import find_spec @@ -22,26 +21,23 @@ from pathlib import Path from typing import Literal, TypeVar, overload -import dagshub +import joblib import mlflow import numpy as np -import ray -import requests +import pandas as pd from beartype import beartype -from dagshub.auth.token_auth import HTTPBearerAuth from joblib.memory import Memory from pandas._typing import Axes -from ray.util.joblib import register_ray from sklearn.utils.validation import check_memory from atom.utils.types import ( - Backend, Bool, DataFrame, Engine, EngineDataOptions, - EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, - IntLargerEqualZero, Pandas, 
Sequence, Severity, Verbose, Warnings, - XSelector, YSelector, bool_t, dataframe_t, int_t, sequence_t, + Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, + EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas, + Severity, Verbose, Warnings, XReturn, XSelector, YReturn, YSelector, + bool_t, int_t, ) from atom.utils.utils import ( - crash, flt, lst, make_sklearn, n_cols, to_df, to_pandas, + check_dependency, crash, lst, make_sklearn, to_df, to_tabular, ) @@ -136,29 +132,18 @@ def engine(self, value: Engine): data=value.get("data", EngineTuple().data), estimator=value.get("estimator", EngineTuple().estimator), ) - else: - engine = value # type: ignore[assignment] - - if engine.data == "modin" and not ray.is_initialized(): - ray.init( - runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_Pandas__": "1"}}, - log_to_driver=False, - ) + elif isinstance(value, EngineTuple): + engine = value - # Update env variable to use for PandasModin in utils.py - os.environ["ATOM_DATA_ENGINE"] = engine.data + # Make sure the data engine library is installed + check_dependency(engine.data_engine.library) if engine.estimator == "sklearnex": - if not find_spec("sklearnex"): - raise ModuleNotFoundError( - "Failed to import scikit-learn-intelex. The library is " - "not installed. Note that the library only supports CPUs " - "with a x86 architecture." - ) - else: - import sklearnex + check_dependency("sklearnex") + import sklearnex + + sklearnex.set_config(self.device.lower() if self._gpu else "auto") - sklearnex.set_config(self.device.lower() if self._gpu else "auto") elif engine.estimator == "cuml": if not find_spec("cuml"): raise ModuleNotFoundError( @@ -186,10 +171,25 @@ def backend(self) -> Backend: @beartype def backend(self, value: Backend): if value == "ray": + check_dependency("ray") + import ray + from ray.util.joblib import register_ray + register_ray() # Register ray as joblib backend if not ray.is_initialized(): ray.init(log_to_driver=False) + elif value == "dask": + check_dependency("dask") + from dask.distributed import Client + + try: + Client.current() + except ValueError: + Client(processes=False) + + joblib.parallel_config(backend=value) + self._backend = value @property @@ -299,6 +299,12 @@ def experiment(self, value: str | None): self._experiment = value if value: if value.lower().startswith("dagshub:"): + check_dependency("dagshub") + check_dependency("requests") + import dagshub + import requests + from dagshub.auth.token_auth import HTTPBearerAuth + value = value[8:] # Drop dagshub: token = dagshub.auth.get_token() @@ -359,99 +365,24 @@ def _device_id(self) -> int: # Methods ====================================================== >> - def _inherit( - self, - obj: T_Estimator, fixed: tuple[str, ...] = (), - feature_names_out: FeatureNamesOut = "one-to-one", - ) -> T_Estimator: - """Inherit parameters from parent. - - Utility method to set the sp (seasonal period), n_jobs and - random_state parameters of an estimator (if available) equal - to that of this instance. If `obj` is a meta-estimator, it - also adjusts the parameters of the base estimator. - - Parameters - ---------- - obj: Estimator - Instance for which to change the parameters. - - fixed: tuple of str, default=() - Fixed parameters that should not be overriden. - - feature_names_out: "one-to-one", callable or None, default="one-to-one" - Determines the list of feature names that will be returned - by the `get_feature_names_out` method. - - - If None: The `get_feature_names_out` method is not defined. 
- - If "one-to-one": The output feature names will be equal to - the input feature names. - - If callable: Function that takes positional arguments self - and a sequence of input feature names. It must return a - sequence of output feature names. - - Returns - ------- - Estimator - Same object with changed parameters. - - """ - for p in obj.get_params(): - if p in fixed: - continue - elif match := re.search("^(n_jobs|random_state)$|__\1$", p): - obj.set_params(**{p: getattr(self, match.group())}) - elif re.search(r"^sp$|__sp$", p) and hasattr(self, "_config") and self._config.sp: - if self.multiple_seasonality: - obj.set_params(**{p: self._config.sp.sp}) - else: - obj.set_params(**{p: lst(self._config.sp.sp)[0]}) - - return make_sklearn(obj, feature_names_out=feature_names_out) - - def _get_est_class(self, name: str, module: str) -> type[Estimator]: - """Import a class from a module. - - When the import fails, for example, if atom uses sklearnex and - that's passed to a transformer, use sklearn's (default engine). - - Parameters - ---------- - name: str - Name of the class to get. - - module: str - Module from which to get the class. - - Returns - ------- - Estimator - Class of the estimator. - - """ - try: - mod = import_module(f"{self.engine.estimator}.{module}") - except (ModuleNotFoundError, AttributeError): - mod = import_module(f"sklearn.{module}") - - return make_sklearn(getattr(mod, name)) - @staticmethod @overload def _check_input( X: XSelector, y: Literal[None], - columns: Axes, - name: Literal[None], - ) -> tuple[DataFrame, None]: ... + *, + columns: Axes | None = ..., + name: str | Axes | None = ..., + ) -> tuple[pd.DataFrame, None]: ... @staticmethod @overload def _check_input( X: Literal[None], y: YSelector, - columns: Literal[None], - name: str | Sequence[str], + *, + columns: Axes | None = ..., + name: str | Axes | None = ..., ) -> tuple[None, Pandas]: ... @staticmethod @@ -459,134 +390,72 @@ def _check_input( def _check_input( X: XSelector, y: YSelector, + *, columns: Axes | None = ..., - name: str | Sequence[str] | None = ..., - ) -> tuple[DataFrame, Pandas]: ... + name: str | Axes | None = ..., + ) -> tuple[pd.DataFrame, Pandas]: ... @staticmethod def _check_input( X: XSelector | None = None, y: YSelector | None = None, + *, columns: Axes | None = None, - name: str | Sequence[str] | None = None, - ) -> tuple[DataFrame | None, Pandas | None]: + name: str | Axes | None = None, + ) -> tuple[pd.DataFrame | None, Pandas | None]: """Prepare the input data. - Convert X and y to pandas (if not already) and perform standard - compatibility checks (dimensions, length, indices, etc...). + Convert X and y to pandas and perform standard compatibility + checks (dimensions, length, indices, etc...). Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: int, str, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. 
- - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. - columns: sequence or None, default=None - Names of the features corresponding to `X`. If X already is a - dataframe, force feature order. If None and X is not a - dataframe, assign default feature names. + columns: sequence of str or None, default=None + Column names for the feature set. If None, default names + are used. name: str, sequence or None, default=None - Name of the target column(s) corresponding to y. If None and - y is not a pandas object, assign default target name. + Name of the target column(s). If None, a default name is + used. Returns ------- - dataframe or None - Feature dataset. Only returned if provided. + pd.DataFrame or None + Feature set. - series, dataframe or None - Target column corresponding to `X`. + pd.Series, pd.DataFrame or None + Target column(s) corresponding to `X`. """ - Xt: DataFrame | None = None - yt: Pandas | None = None - if X is None and y is None: raise ValueError("X and y can't be both None!") - elif X is not None: - Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) - - # If text dataset, change the name of the column to corpus - if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": - Xt = Xt.rename(columns={Xt.columns[0]: "corpus"}) - else: - # Convert all column names to str - Xt.columns = Xt.columns.astype(str) - - # No duplicate rows nor column names are allowed - if Xt.columns.duplicated().any(): - raise ValueError("Duplicate column names found in X.") - - # Reorder columns to original order - if columns is not None: - try: - Xt = Xt[list(columns)] # Force order determined by columns - except KeyError: - raise ValueError( - f"The features are different than seen at fit time. " - f"Features {set(Xt.columns) - set(columns)} are missing in X." - ) from None + else: + Xt = to_df(X() if callable(X) else X, columns=columns) # Prepare target column - if isinstance(y, (dict, *sequence_t, *dataframe_t)): - if isinstance(y, dict): - yt = to_df(deepcopy(y), index=getattr(Xt, "index", None)) - if n_cols(yt) == 1: - yt = yt.iloc[:, 0] # If y is one-dimensional, get series - - else: - # If X and y have different number of rows, try multioutput - if Xt is not None and len(Xt) != len(y): - try: - targets: list[Hashable] = [] - for col in y: - if col in Xt.columns: - targets.append(col) - elif isinstance(col, int_t): - if -Xt.shape[1] <= col < Xt.shape[1]: - targets.append(Xt.columns[int(col)]) - else: - raise IndexError( - "Invalid value for the y parameter. Value " - f"{col} is out of range for data with " - f"{Xt.shape[1]} columns." - ) - - Xt, yt = Xt.drop(columns=targets), Xt[targets] - - except (TypeError, IndexError, KeyError): - raise ValueError( - "X and y don't have the same number of rows," - f" got len(X)={len(Xt)} and len(y)={len(y)}." 
- ) from None - else: - yt = y - - default_cols = [f"y{i}" for i in range(n_cols(y))] - yt = to_pandas( - data=deepcopy(yt), - index=getattr(Xt, "index", None), - name=flt(name) if name is not None else "target", - columns=name if isinstance(name, sequence_t) else default_cols, - ) - - # Check X and y have the same indices - if Xt is not None and not Xt.index.equals(yt.index): - raise ValueError("X and y don't have the same indices!") + yt: Pandas | None + if y is None: + yt = None + elif isinstance(y, int_t): + if Xt is None: + raise ValueError("X can't be None when y is an int.") + Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] elif isinstance(y, str): if Xt is not None: if y not in Xt.columns: @@ -596,15 +465,148 @@ def _check_input( else: raise ValueError("X can't be None when y is a string.") + else: + # If X and y have different number of rows, try multioutput + if Xt is not None and not isinstance(y, dict) and len(Xt) != len(y): + try: + targets: list[Hashable] = [] + for col in y: + if isinstance(col, str) and col in Xt.columns: + targets.append(col) + elif isinstance(col, int_t): + if -Xt.shape[1] <= col < Xt.shape[1]: + targets.append(Xt.columns[int(col)]) + else: + raise IndexError( + "Invalid value for the y parameter. Value " + f"{col} is out of range for data with " + f"{Xt.shape[1]} columns." + ) + + Xt, yt = Xt.drop(columns=targets), Xt[targets] + + except (TypeError, IndexError, KeyError): + raise ValueError( + "X and y don't have the same number of rows," + f" got len(X)={len(Xt)} and len(y)={len(y)}." + ) from None + else: + yt = to_tabular(y, index=getattr(Xt, "index", None), columns=name) - elif isinstance(y, int_t): - if Xt is None: - raise ValueError("X can't be None when y is an int.") - - Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] + # Check X and y have the same indices + if Xt is not None and not Xt.index.equals(yt.index): + raise ValueError("X and y don't have the same indices!") return Xt, yt + @overload + def _convert(self, obj: Literal[None]) -> None: ... + + @overload + def _convert(self, obj: pd.DataFrame) -> XReturn: ... + + @overload + def _convert(self, obj: pd.Series) -> YReturn: ... + + def _convert(self, obj: Pandas | None) -> YReturn | None: + """Convert data to the type set in the data engine. + + Non-pandas types are returned as is. + + Parameters + ---------- + obj: object + Object to convert. + + Returns + ------- + object + Converted data or unchanged object. + + """ + # Only apply transformations when the engine is defined + if hasattr(self, "_engine") and isinstance(obj, pd.Series | pd.DataFrame): + return self._engine.data_engine.convert(obj) + else: + return obj + + def _get_est_class(self, name: str, module: str) -> type[Estimator]: + """Import a class from a module. + + When the import fails, for example, if atom uses sklearnex and + that's passed to a transformer, use sklearn's (default engine). + + Parameters + ---------- + name: str + Name of the class to get. + + module: str + Module from which to get the class. + + Returns + ------- + Estimator + Class of the estimator. + + """ + try: + mod = import_module(f"{self.engine.estimator}.{module}") + except (ModuleNotFoundError, AttributeError): + mod = import_module(f"sklearn.{module}") + + return make_sklearn(getattr(mod, name)) + + def _inherit( + self, + obj: T_Estimator, fixed: tuple[str, ...] = (), + feature_names_out: FeatureNamesOut = "one-to-one", + ) -> T_Estimator: + """Inherit parameters from parent. 
+ + Utility method to set the sp (seasonal period), n_jobs and + random_state parameters of an estimator (if available) equal + to that of this instance. If `obj` is a meta-estimator, it + also adjusts the parameters of the base estimator. + + Parameters + ---------- + obj: Estimator + Instance for which to change the parameters. + + fixed: tuple of str, default=() + Fixed parameters that should not be overriden. + + feature_names_out: "one-to-one", callable or None, default="one-to-one" + Determines the list of feature names that will be returned + by the `get_feature_names_out` method. + + - If None: The `get_feature_names_out` method is not defined. + - If "one-to-one": The output feature names will be equal to + the input feature names. + - If callable: Function that takes positional arguments self + and a sequence of input feature names. It must return a + sequence of output feature names. + + Returns + ------- + Estimator + Same object with changed parameters. + + """ + for p in obj.get_params(): + if p in fixed: + continue + elif match := re.search("^(n_jobs|random_state)$|__\1$", p): + obj.set_params(**{p: getattr(self, match.group())}) + elif re.search(r"^sp$|__sp$", p) and hasattr(self, "_config") and self._config.sp: + if self.multiple_seasonality: + obj.set_params(**{p: self._config.sp.sp}) + else: + obj.set_params(**{p: lst(self._config.sp.sp)[0]}) + + return make_sklearn(obj, feature_names_out=feature_names_out) + @crash def _log(self, msg: str, level: Int = 0, severity: Severity = "info"): """Print message and save to log file. diff --git a/atom/branch/__init__.py b/atom/branch/__init__.py deleted file mode 100644 index dd6f3adc1..000000000 --- a/atom/branch/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Automated Tool for Optimized Modeling (ATOM). - -Author: Mavs -Description: Module for branches. - -""" - -from atom.branch.branch import Branch -from atom.branch.branchmanager import BranchManager diff --git a/atom/data/__init__.py b/atom/data/__init__.py new file mode 100644 index 000000000..236e72416 --- /dev/null +++ b/atom/data/__init__.py @@ -0,0 +1,10 @@ +"""Automated Tool for Optimized Modeling (ATOM). + +Author: Mavs +Description: Module for branches. + +""" + +from atom.data.branch import Branch +from atom.data.branchmanager import BranchManager +from atom.data.dataengines import DATA_ENGINES diff --git a/atom/branch/branch.py b/atom/data/branch.py similarity index 84% rename from atom/branch/branch.py rename to atom/data/branch.py index 8481386a8..d2f1f20b4 100644 --- a/atom/branch/branch.py +++ b/atom/data/branch.py @@ -15,6 +15,7 @@ from warnings import filterwarnings import dill as pickle +import pandas as pd from beartype import beartype from beartype.roar import BeartypeDecorHintPep585DeprecationWarning from joblib.memory import Memory @@ -22,12 +23,13 @@ from atom.pipeline import Pipeline from atom.utils.types import ( - Bool, ColumnSelector, DataFrame, Index, Int, IntLargerEqualZero, Pandas, - RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, XSelector, - YSelector, dataframe_t, index_t, int_t, segment_t, series_t, + Bool, ColumnSelector, Int, IntLargerEqualZero, Pandas, RowSelector, Scalar, + TargetSelector, TargetsSelector, XConstructor, XDatasets, YConstructor, + YDatasets, int_t, segment_t, ) from atom.utils.utils import ( - DataContainer, bk, flt, get_cols, lst, merge, to_pandas, + DataContainer, check_scaling, flt, get_col_names, get_cols, lst, merge, + to_tabular, ) @@ -58,16 +60,16 @@ class Branch: name: str Name of the branch. 
- memory: str, [Memory][joblibmemory] or None, default=None - Memory object for pipeline caching and to store the data when - the branch is inactive. - data: DataContainer or None, default=None Data for the branch. - holdout: dataframe or None, default=None + holdout: pd.DataFrame or None, default=None Holdout data set. + memory: str, [Memory][joblibmemory] or None, default=None + Memory object for pipeline caching and to store the data when + the branch is inactive. + See Also -------- atom.branch:BranchManager @@ -98,12 +100,33 @@ class Branch: """ + _shared_attrs = ( + "pipeline", + "mapping", + "dataset", + "train", + "test", + "X", + "y", + "X_train", + "y_train", + "X_test", + "y_test", + "shape", + "columns", + "n_columns", + "features", + "n_features", + "target", + ) + def __init__( self, name: str, - memory: str | Memory | None = None, data: DataContainer | None = None, - holdout: DataFrame | None = None, + holdout: pd.DataFrame | None = None, + *, + memory: str | Memory | None = None, ): self.name = name self.memory = check_memory(memory) @@ -161,14 +184,16 @@ def name(self, value: str): # Data properties ============================================== >> - def _check_setter( - self, - name: str, - value: Sequence[Scalar | str] | XSelector, - ) -> Pandas: + @overload + def _check_setter(self, name: XDatasets, value: YConstructor) -> pd.DataFrame: ... + + @overload + def _check_setter(self, name: YDatasets, value: YConstructor) -> pd.Series: ... + + def _check_setter(self, name: XDatasets | YDatasets, value: YConstructor) -> Pandas: """Check the data set's setter property. - Convert the property to a pandas object and compare with the + Convert the property to a 'pandas' object and compare with the rest of the dataset, to check if it has the right indices and dimensions. @@ -182,7 +207,7 @@ def _check_setter( Returns ------- - series or dataframe + pd.Series or pd.DataFrame Data set. 
""" @@ -226,11 +251,13 @@ def counter(name: str, dim: str) -> str | None: if under_name := counter(name, "under"): under = getattr(self, under_name) - obj = to_pandas( + if (columns := get_col_names(value)) is None: + columns = get_col_names(under) if under_name else None + + obj = to_tabular( data=value, index=side.index if side_name else None, - name=getattr(under, "name", "target") if under_name else "target", - columns=getattr(under, "columns", None) if under_name else None, + columns=columns, ) if side_name: # Check for equal rows @@ -246,7 +273,7 @@ def counter(name: str, dim: str) -> str | None: ) if under_name: # Check for equal columns - if isinstance(obj, series_t): + if isinstance(obj, pd.Series): if obj.name != under.name: raise ValueError( f"{name} and {under_name} must have the " @@ -292,38 +319,38 @@ def mapping(self) -> dict[str, dict[Hashable, Scalar]]: return self._mapping @property - def dataset(self) -> DataFrame: + def dataset(self) -> pd.DataFrame: """Complete data set.""" return self._data.data @dataset.setter - def dataset(self, value: XSelector): + def dataset(self, value: XConstructor): self._data.data = self._check_setter("dataset", value) @property - def train(self) -> DataFrame: + def train(self) -> pd.DataFrame: """Training set.""" return self._data.data.loc[self._data.train_idx] @train.setter - def train(self, value: XSelector): + def train(self, value: XConstructor): df = self._check_setter("train", value) - self._data.data = bk.concat([df, self.test]) + self._data.data = pd.concat([df, self.test]) self._data.train_idx = df.index @property - def test(self) -> DataFrame: + def test(self) -> pd.DataFrame: """Test set.""" return self._data.data.loc[self._data.test_idx] @test.setter - def test(self, value: XSelector): + def test(self, value: XConstructor): df = self._check_setter("test", value) - self._data.data = bk.concat([self.train, df]) + self._data.data = pd.concat([self.train, df]) self._data.test_idx = df.index @cached_property - def holdout(self) -> DataFrame | None: + def holdout(self) -> pd.DataFrame | None: """Holdout set.""" if self._holdout is not None: return merge( @@ -336,12 +363,12 @@ def holdout(self) -> DataFrame | None: return None @property - def X(self) -> DataFrame: + def X(self) -> pd.DataFrame: """Feature set.""" return self._data.data[self.features] @X.setter - def X(self, value: XSelector): + def X(self, value: XConstructor): df = self._check_setter("X", value) self._data.data = merge(df, self.y) @@ -351,19 +378,19 @@ def y(self) -> Pandas: return self._data.data[self.target] @y.setter - def y(self, value: YSelector): + def y(self, value: YConstructor): series = self._check_setter("y", value) self._data.data = merge(self.X, series) @property - def X_train(self) -> DataFrame: + def X_train(self) -> pd.DataFrame: """Features of the training set.""" return self.train[self.features] @X_train.setter - def X_train(self, value: XSelector): + def X_train(self, value: XConstructor): df = self._check_setter("X_train", value) - self._data.data = bk.concat([merge(df, self.y_train), self.test]) + self._data.data = pd.concat([merge(df, self.y_train), self.test]) @property def y_train(self) -> Pandas: @@ -371,19 +398,19 @@ def y_train(self) -> Pandas: return self.train[self.target] @y_train.setter - def y_train(self, value: YSelector): + def y_train(self, value: YConstructor): series = self._check_setter("y_train", value) - self._data.data = bk.concat([merge(self.X_train, series), self.test]) + self._data.data = pd.concat([merge(self.X_train, 
series), self.test]) @property - def X_test(self) -> DataFrame: + def X_test(self) -> pd.DataFrame: """Features of the test set.""" return self.test[self.features] @X_test.setter - def X_test(self, value: XSelector): + def X_test(self, value: XConstructor): df = self._check_setter("X_test", value) - self._data.data = bk.concat([self.train, merge(df, self.y_test)]) + self._data.data = pd.concat([self.train, merge(df, self.y_test)]) @property def y_test(self) -> Pandas: @@ -391,9 +418,9 @@ def y_test(self) -> Pandas: return self.test[self.target] @y_test.setter - def y_test(self, value: YSelector): + def y_test(self, value: YConstructor): series = self._check_setter("y_test", value) - self._data.data = bk.concat([self.train, merge(self.X_test, series)]) + self._data.data = pd.concat([self.train, merge(self.X_test, series)]) @property def shape(self) -> tuple[Int, Int]: @@ -401,49 +428,61 @@ def shape(self) -> tuple[Int, Int]: return self.dataset.shape @property - def columns(self) -> Index: + def columns(self) -> list[str]: """Name of all the columns.""" - return self.dataset.columns + return list(self.dataset.columns) @property - def n_columns(self) -> Int: + def n_columns(self) -> int: """Number of columns.""" return len(self.columns) @property - def features(self) -> Index: + def features(self) -> list[str]: """Name of the features.""" - return self.columns[:-self._data.n_cols] + return list(self.columns[:-self._data.n_targets]) @property - def n_features(self) -> Int: + def n_features(self) -> int: """Number of features.""" return len(self.features) @property def target(self) -> str | list[str]: """Name of the target column(s).""" - return flt(list(self.columns[-self._data.n_cols:])) + return flt(list(self.columns[-self._data.n_targets:])) @property - def _all(self) -> DataFrame: + def _all(self) -> pd.DataFrame: """Dataset + holdout. Note that calling this property triggers the holdout set calculation. """ - return bk.concat([self.dataset, self.holdout]) + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> + def _get_shared_attrs(self) -> list[str]: + """Get the attributes that can be accessed from a runner. + + Returns + ------- + list of str + Instance attributes. + + """ + instance_vars = [x for x in vars(self) if not x.startswith("_") and x.endswith("_")] + return list(self._shared_attrs) + instance_vars + @overload def _get_rows( self, rows: RowSelector, *, return_X_y: Literal[False] = ..., - ) -> DataFrame: ... + ) -> pd.DataFrame: ... @overload def _get_rows( @@ -451,14 +490,14 @@ def _get_rows( rows: RowSelector, *, return_X_y: Literal[True], - ) -> tuple[DataFrame, Pandas]: ... + ) -> tuple[pd.DataFrame, Pandas]: ... def _get_rows( self, rows: RowSelector, *, return_X_y: Bool = False, - ) -> DataFrame | tuple[DataFrame, Pandas]: + ) -> pd.DataFrame | tuple[pd.DataFrame, Pandas]: """Get a subset of the rows. Rows can be selected by name, index, data set or regex pattern. @@ -479,10 +518,10 @@ def _get_rows( Returns ------- - dataframe + pd.DataFrame Subset of rows. - series or dataframe + pd.Series or pd.Dataframe Subset of target column. Only returned if return_X_y=True. 
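A short sketch of the user-facing effect of the branch changes above (the breast_cancer data is an assumption; exact dtypes depend on the installed pyarrow version). Shared branch attributes accessed from the runner now pass through `_convert`, and `columns`/`features` are plain lists instead of a pandas Index.

```python
from sklearn.datasets import load_breast_cancer
from atom import ATOMClassifier

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
atom = ATOMClassifier(X, y, engine={"data": "pandas-pyarrow"}, verbose=0)

# The dataset is stored as regular pandas but surfaces with pyarrow-backed
# dtypes because the runner converts shared attributes through the engine.
print(atom.dataset.dtypes.head())

# Column accessors now return plain lists of names.
print(type(atom.branch.columns))  # <class 'list'>
print(atom.branch.n_features)
```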
""" @@ -490,9 +529,9 @@ def _get_rows( inc: list[Hashable] = [] exc: list[Hashable] = [] - if isinstance(rows, dataframe_t): + if isinstance(rows, pd.DataFrame): inc.extend(rows.index) - elif isinstance(rows, index_t): + elif isinstance(rows, pd.Index): inc.extend(rows) elif isinstance(rows, segment_t): inc.extend(_all.index[rows]) @@ -541,10 +580,12 @@ def _get_rows( # If rows were excluded with `!`, select all but those inc = list(_all.index[~_all.index.isin(exc)]) + rows_c = _all.loc[inc] + if return_X_y: - return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index] + return rows_c[self.features], rows_c[self.target] else: - return self._all.loc[inc] + return rows_c def _get_columns( self, @@ -590,7 +631,7 @@ def _get_columns( return list(df.select_dtypes(include=["number"]).columns) else: return list(df.columns) - elif isinstance(columns, dataframe_t): + elif isinstance(columns, pd.DataFrame): inc.extend(list(columns.columns)) elif isinstance(columns, segment_t): inc.extend(list(df.columns[columns])) @@ -755,7 +796,7 @@ def get_class( if only_columns and not isinstance(target, tuple): return get_column(target) elif isinstance(target, tuple): - if not isinstance(self.y, dataframe_t): + if not isinstance(self.y, pd.DataFrame): raise ValueError( f"Invalid value for the target parameter, got {target}. " "A tuple is only accepted for multioutput tasks." @@ -831,3 +872,27 @@ def store(self, *, assign: Bool = True): if assign: self._container = None + + def check_scaling(self) -> bool: + """Whether the feature set is scaled. + + A data set is considered scaled when it has mean~0 and std~1, + or when there is a scaler in the pipeline. Categorical and + binary columns (only zeros and ones) are excluded from the + calculation. + + Returns + ------- + bool + Whether the feature set is scaled. + + """ + if any("scaler" in name.lower() for name in self.pipeline.named_steps): + return True + + df = self.X.loc[:, (~self.X.isin([0, 1])).any(axis=0)] # Remove binary columns + + if df.empty: # All columns are binary -> no scaling needed + return True + else: + return check_scaling(df) diff --git a/atom/branch/branchmanager.py b/atom/data/branchmanager.py similarity index 94% rename from atom/branch/branchmanager.py rename to atom/data/branchmanager.py index 0d2a36f7d..66f7ed3de 100644 --- a/atom/branch/branchmanager.py +++ b/atom/data/branchmanager.py @@ -11,12 +11,13 @@ from collections.abc import Iterator from copy import copy, deepcopy +import pandas as pd from beartype import beartype from joblib.memory import Memory from sklearn.utils.validation import check_memory -from atom.branch.branch import Branch -from atom.utils.types import Bool, DataFrame, Int +from atom.data.branch import Branch +from atom.utils.types import Bool, Int from atom.utils.utils import ClassMap, DataContainer @@ -99,7 +100,7 @@ def __repr__(self) -> str: """Print containing branches.""" return f"BranchManager([{', '.join(self.branches.keys())}], og={self.og.name})" - def __len__(self) -> Int: + def __len__(self) -> int: """Get the number of branches in the manager.""" return len(self.branches) @@ -212,9 +213,11 @@ def add(self, name: str, parent: Branch | None = None): if parent: self._copy_from_parent(self.current, parent) - def fill(self, data: DataContainer, holdout: DataFrame | None = None): + def fill(self, data: DataContainer, holdout: pd.DataFrame | None = None): """Fill the current branch with data. + This call resets the cached holdout calculation. 
+ Parameters ---------- data: DataContainer @@ -225,7 +228,10 @@ def fill(self, data: DataContainer, holdout: DataFrame | None = None): """ self.current._container = data - self.current._holdout = holdout + if holdout is not None: + self.current._holdout = holdout + + self.current.__dict__.pop("holdout", None) def reset(self, *, hard: Bool = False): """Reset this instance to its initial state. diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py new file mode 100644 index 000000000..7d5d4500c --- /dev/null +++ b/atom/data/dataengines.py @@ -0,0 +1,206 @@ +"""Automated Tool for Optimized Modeling (ATOM). + +Author: Mavs +Description: Module containing the data engines. + +""" + +from __future__ import annotations + +from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING + +import numpy as np +import pandas as pd + +from atom.utils.types import Any, Pandas + + +if TYPE_CHECKING: + import dask.dataframe as dd + import modin.pandas as md + import polars as pl + import pyarrow as pa + import pyspark.pandas as ps + import pyspark.sql as psql + + +class DataEngine(metaclass=ABCMeta): + """Abstract class for data engines. + + Data engines convert a pandas object to a specific type. + The type is determined by the data engine. + + """ + + @staticmethod + @abstractmethod + def convert(obj: Pandas) -> Any: + """Convert to data engine output types.""" + + +class NumpyEngine(DataEngine): + """Numpy data engine.""" + + library = "numpy" + + @staticmethod + def convert(obj: Pandas) -> np.ndarray: + """Convert to numpy array.""" + return obj.to_numpy() + + +class PandasEngine(DataEngine): + """Pandas numpy data engine.""" + + library = "pandas" + + @staticmethod + def convert(obj: Pandas) -> Pandas: + """Leave as is.""" + return obj + + +class PandasPyarrowEngine(DataEngine): + """Pandas pyarrow data engine.""" + + library = "pandas" + + @staticmethod + def convert(obj: Pandas) -> Pandas: + """Convert to pyarrow dtypes.""" + from pyarrow import from_numpy_dtype + + if isinstance(obj, pd.DataFrame): + return obj.astype( + { + c: pd.ArrowDtype(from_numpy_dtype(getattr(d, "numpy_dtype", d))) + for c, d in obj.dtypes.items() + } + ) + else: + return obj.astype( + pd.ArrowDtype(from_numpy_dtype(obj.dtype)) + if isinstance(obj.dtype, np.dtype) else obj.dtype + ) + + +class PolarsEngine(DataEngine): + """Polars data engine.""" + + library = "polars" + + @staticmethod + def convert(obj: Pandas) -> pl.Series | pl.DataFrame: + """Convert to polars objects.""" + import polars as pl + + if isinstance(obj, pd.DataFrame): + return pl.DataFrame(obj) + else: + return pl.Series(obj) + + +class PolarsLazyEngine(DataEngine): + """Polars lazy data engine.""" + + library = "polars" + + @staticmethod + def convert(obj: Pandas) -> pl.Series | pl.LazyFrame: + """Convert to lazy polars objects.""" + import polars as pl + + if isinstance(obj, pd.DataFrame): + return pl.LazyFrame(obj) + else: + return pl.Series(obj) + + +class PyArrowEngine(DataEngine): + """PyArrow data engine.""" + + library = "pyarrow" + + @staticmethod + def convert(obj: Pandas) -> pa.Array | pa.Table: + """Convert to pyarrow objects.""" + import pyarrow as pa + + if isinstance(obj, pd.DataFrame): + return pa.Table.from_pandas(obj) + else: + return pa.Array.from_pandas(obj) + + +class ModinEngine(DataEngine): + """Modin data engine.""" + + library = "modin" + + @staticmethod + def convert(obj: Pandas) -> md.Series | md.DataFrame: + """Convert to modin objects.""" + import modin.pandas as md + + if isinstance(obj, pd.DataFrame): + 
return md.DataFrame(obj) + else: + return md.Series(obj) + + +class DaskEngine(DataEngine): + """Dask data engine.""" + + library = "dask" + + @staticmethod + def convert(obj: Pandas) -> dd.Series | dd.DataFrame: + """Convert to dask objects.""" + import dask.dataframe as dd + + return dd.from_pandas(obj, npartitions=int(max(1, len(obj) // 1e6))) + + +class PySparkEngine(DataEngine): + """PySpark data engine.""" + + library = "pyspark" + + @staticmethod + def convert(obj: Pandas) -> psql.DataFrame: + """Convert to pyspark objects.""" + from pyspark.sql import SparkSession + + spark = SparkSession.builder.appName("atom-ml").getOrCreate() + return spark.createDataFrame(obj) + + +class PySparkPandasEngine(DataEngine): + """PySpark data engine with pandas API.""" + + library = "pyspark" + + @staticmethod + def convert(obj: Pandas) -> ps.Series | ps.DataFrame: + """Convert to pyspark objects.""" + import pyspark.pandas as ps + + if isinstance(obj, pd.DataFrame): + return ps.DataFrame(obj) + else: + return ps.Series(obj) + + +DATA_ENGINES = { + "numpy": NumpyEngine, + "pandas": PandasEngine, + "pandas-pyarrow": PandasPyarrowEngine, + "polars": PolarsEngine, + "polars-lazy": PolarsLazyEngine, + "pyarrow": PyArrowEngine, + "modin": ModinEngine, + "dask": DaskEngine, + "pyspark": PySparkEngine, + "pyspark-pandas": PySparkPandasEngine, +} diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 165e31475..2861c0326 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -10,8 +10,7 @@ import re from collections import defaultdict from collections.abc import Hashable -from typing import Any, Literal, TypeVar -from unittest.mock import patch +from typing import Any, Literal, TypeVar, overload import numpy as np import pandas as pd @@ -40,33 +39,31 @@ from sklearn.compose import ColumnTransformer from sklearn.experimental import enable_iterative_imputer # noqa: F401 from sklearn.impute import IterativeImputer, KNNImputer -from sklearn.utils._set_output import _SetOutputMixin from sklearn.utils.validation import _check_feature_names_in from sktime.transformations.series.detrend import ( ConditionalDeseasonalizer, Deseasonalizer, Detrender, ) -from sktime.transformations.series.impute import Imputer as sktimeImputer +from sktime.transformations.series.impute import Imputer as SktimeImputer from typing_extensions import Self from atom.basetransformer import BaseTransformer from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING -from atom.utils.patches import wrap_method_output from atom.utils.types import ( - Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine, - EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, - IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, - Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, - Sequence, Series, Transformer, Verbose, XConstructor, YConstructor, - dataframe_t, sequence_t, series_t, + Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, + EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int, + IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, + NumericalStrats, Predictor, PrunerStrats, Scalar, ScalerStrats, + SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, XReturn, + YConstructor, YReturn, sequence_t, ) from atom.utils.utils import ( - Goal, bk, check_is_fitted, composed, crash, get_col_order, get_cols, it, - lst, make_sklearn, merge, method_to_log, n_cols, replace_missing, sign, - to_df, to_series, 
variable_return, wrap_transformer_methods, + Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst, + make_sklearn, merge, n_cols, replace_missing, sign, to_df, to_series, + to_tabular, variable_return, ) -T = TypeVar("T", bound=Transformer) +T_Transformer = TypeVar("T_Transformer", bound=Transformer) @beartype @@ -77,21 +74,12 @@ class TransformerMixin(BaseEstimator, BaseTransformer): - Accounts for the transformation of y. - Always add a fit method. - - Wraps the fit method with a data check. - - Wraps transforming methods with fit and data check. + - Wraps the fit method with attributes and a data check. + - Wraps transforming methods a data check. - Maintains internal attributes when cloned. """ - def __init_subclass__(cls, **kwargs): - """Wrap transformer methods to apply data and fit check.""" - for k in ("fit", "transform", "inverse_transform"): - setattr(cls, k, wrap_transformer_methods(getattr(cls, k))) - - # Patch to avoid errors for transformers that allow passing only y - with patch("sklearn.utils._set_output._wrap_method_output", wrap_method_output): - super().__init_subclass__(**kwargs) - def __repr__(self, N_CHAR_MAX: Int = 700) -> str: """Drop named tuples if default parameters from string representation.""" out = super().__repr__(N_CHAR_MAX) @@ -107,7 +95,7 @@ def __repr__(self, N_CHAR_MAX: Int = 700) -> str: return out - def __sklearn_clone__(self: T) -> T: + def __sklearn_clone__(self: T_Transformer) -> T_Transformer: """Wrap cloning method to attach internal attributes.""" cloned = _clone_parametrized(self) @@ -117,7 +105,6 @@ def __sklearn_clone__(self: T) -> T: return cloned - @composed(crash, method_to_log) def fit( self, X: XConstructor | None = None, @@ -132,20 +119,11 @@ def fit( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. - - y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + `X` is ignored. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. If None, `y` is + ignored. **fit_params Additional keyword arguments for the fit method. @@ -156,37 +134,56 @@ def fit( Estimator instance. """ + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self._log(f"Fitting {self.__class__.__name__}...", 1) return self - @composed(crash, method_to_log) + @overload + def fit_transform( + self, + X: Literal[None], + y: YConstructor, + **fit_params, + ) -> YReturn: ... + + @overload + def fit_transform( + self, + X: XConstructor, + y: Literal[None] = ..., + **fit_params, + ) -> XReturn: ... + + @overload + def fit_transform( + self, + X: XConstructor, + y: YConstructor, + **fit_params, + ) -> tuple[XReturn, YReturn]: ... + def fit_transform( self, X: XConstructor | None = None, y: YConstructor | None = None, **fit_params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Fit to data, then transform it. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). 
If None, - X is ignored. - - y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + `X` is ignored. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. If None, `y` is + ignored. **fit_params Additional keyword arguments for the fit method. @@ -202,12 +199,36 @@ def fit_transform( """ return self.fit(X, y, **fit_params).transform(X, y) - @composed(crash, method_to_log) + @overload + def inverse_transform( + self, + X: Literal[None], + y: YConstructor, + **fit_params, + ) -> YReturn: ... + + @overload + def inverse_transform( + self, + X: XConstructor, + y: Literal[None] = ..., + **fit_params, + ) -> XReturn: ... + + @overload + def inverse_transform( + self, + X: XConstructor, + y: YConstructor, + **fit_params, + ) -> tuple[XReturn, YReturn]: ... + def inverse_transform( self, X: XConstructor | None = None, y: YConstructor | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + **fit_params, + ) -> YReturn | tuple[XReturn, YReturn]: """Do nothing. Returns the input unchanged. Implemented for continuity of the @@ -217,20 +238,11 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. If None, `y` is + ignored. Returns ------- @@ -238,14 +250,58 @@ def inverse_transform( Feature set. Only returned if provided. series or dataframe - Target column. Only returned if provided. + Target column(s). Only returned if provided. """ - return variable_return(X, y) + check_is_fitted(self) + + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + + return variable_return(self._convert(Xt), self._convert(yt)) + + def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: + """Set output container. + + See sklearn's [user guide][set_output] on how to use the + `set_output` API. See [here][data-acceleration] a description + of the choices. + + Parameters + ---------- + transform: str or None, default=None + Configure the output of the `transform`, `fit_transform`, + and `inverse_transform` method. If None, the configuration + is not changed. Choose from: + + - "numpy" + - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" + - "pyarrow" + - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" + + Returns + ------- + Self + Estimator instance. 
+ + """ + if not hasattr(self, "_engine"): + self.engine = EngineTuple() + + if transform is not None: + self.engine = EngineTuple(estimator=self.engine.estimator, data=transform) + + return self @beartype -class Balancer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Balancer(TransformerMixin, OneToOneFeatureMixin): """Balance the number of samples per class in the target column. When oversampling, the newly created samples have an increasing @@ -364,8 +420,23 @@ def __init__( self.strategy = strategy self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas = -1) -> Self: + def _log_changes(self, y: pd.Series): + """Print the changes per target class. + + Parameters + ---------- + y: pd.Series + Target column. + + """ + for key, value in self.mapping_.items(): + diff = self._counts[key] - np.sum(y == value) + if diff > 0: + self._log(f" --> Removing {diff} samples from class {key}.", 2) + elif diff < 0: + self._log(f" --> Adding {-diff} samples to class {key}.", 2) + + def fit(self, X: XConstructor, y: YConstructor) -> Self: """Fit to data. Parameters @@ -373,31 +444,28 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict or sequence, default=-1 + y: sequence Target column corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - Returns ------- Self Estimator instance. """ - if isinstance(y, series_t): - self.target_names_in_ = np.array([y.name]) + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + + if isinstance(yt, pd.Series): + self.target_names_in_ = np.array([yt.name]) else: raise ValueError("The Balancer class does not support multioutput tasks.") + # ClusterCentroids is unavailable since it has no sample_indices_ strategies = { - # clustercentroids=ClusterCentroids, # noqa: ERA001 (has no sample_indices_) "condensednearestneighbour": CondensedNearestNeighbour, "editednearestneighborus": EditedNearestNeighbours, "repeatededitednearestneighbours": RepeatedEditedNearestNeighbours, @@ -440,21 +508,20 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self: # Create dict of class counts in y if not hasattr(self, "mapping_"): - self.mapping_ = {str(v): v for v in y.sort_values().unique()} + self.mapping_ = {str(v): v for v in yt.sort_values().unique()} self._counts = {} for key, value in self.mapping_.items(): - self._counts[key] = np.sum(y == value) + self._counts[key] = np.sum(yt == value) - self._estimator = estimator.fit(X, y) + self._estimator = estimator.fit(Xt, yt) # Add the estimator as attribute to the instance setattr(self, f"{estimator.__class__.__name__.lower()}_", self._estimator) return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]: + def transform(self, X: XConstructor, y: YConstructor) -> tuple[XReturn, YReturn]: """Balance the data. Parameters @@ -462,13 +529,9 @@ def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]: X: dataframe-like Feature set with shape=(n_samples, n_features). 
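The new TransformerMixin.set_output above stores the requested container in the transformer's engine tuple, so any of the data-cleaning classes can emit a non-pandas container from transform and fit_transform. A hedged sketch using the Scaler defined later in this module (assumes pyarrow is installed):

import pandas as pd
from atom.data_cleaning import Scaler

X = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [10.0, 20.0, 30.0]})

scaler = Scaler().set_output(transform="pyarrow")
X_scaled = scaler.fit_transform(X)  # expected to come back as a pyarrow.Table instead of a DataFrame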
- y: int, str or sequence, default=-1 + y: sequence Target column corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - Else: Array with shape=(n_samples,) to use as target. - Returns ------- dataframe @@ -478,79 +541,74 @@ def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]: Transformed target column. """ + check_is_fitted(self) - def log_changes(y): - """Print the changes per target class.""" - for key, value in self.mapping_.items(): - diff = self._counts[key] - np.sum(y == value) - if diff > 0: - self._log(f" --> Removing {diff} samples from class {key}.", 2) - elif diff < 0: - self._log(f" --> Adding {-diff} samples to class {key}.", 2) + Xt = to_df(X, columns=self.feature_names_in_) + yt = to_series(y, index=Xt.index, name=self.target_names_in_[0]) # type: ignore[arg-type] if "over_sampling" in self._estimator.__module__: self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1) - index = X.index # Save indices for later reassignment - X, y = self._estimator.fit_resample(X, y) + index = Xt.index # Save indices for later reassignment + Xt, yt = self._estimator.fit_resample(Xt, yt) # Create indices for the new samples n_idx: list[int | str] if index.dtype.kind in "ifu": - n_idx = list(range(max(index) + 1, max(index) + len(X) - len(index) + 1)) + n_idx = list(range(max(index) + 1, max(index) + len(Xt) - len(index) + 1)) else: n_idx = [ f"{self._estimator.__class__.__name__.lower()}_{i}" - for i in range(1, len(X) - len(index) + 1) + for i in range(1, len(Xt) - len(index) + 1) ] # Assign the old + new indices - X.index = list(index) + list(n_idx) - y.index = list(index) + list(n_idx) + Xt.index = pd.Index(list(index) + n_idx) + yt.index = pd.Index(list(index) + n_idx) - log_changes(y) + self._log_changes(yt) elif "under_sampling" in self._estimator.__module__: self._log(f"Undersampling with {self._estimator.__class__.__name__}...", 1) - self._estimator.fit_resample(X, y) + self._estimator.fit_resample(Xt, yt) # Select chosen rows (imblearn doesn't return them in order) - samples = sorted(self._estimator.sample_indices_) - X, y = X.iloc[samples], y.iloc[samples] # type: ignore[call-overload] + samples = np.asarray(sorted(self._estimator.sample_indices_)) + Xt, yt = Xt.iloc[samples], yt.iloc[samples] - log_changes(y) + self._log_changes(yt) elif "combine" in self._estimator.__module__: self._log(f"Balancing with {self._estimator.__class__.__name__}...", 1) - index = X.index - X_new, y_new = self._estimator.fit_resample(X, y) + index = Xt.index + X_new, y_new = self._estimator.fit_resample(Xt, yt) # Select rows kept by the undersampler if self._estimator.__class__.__name__ == "SMOTEENN": - samples = sorted(self._estimator.enn_.sample_indices_) + samples = np.asarray(sorted(self._estimator.enn_.sample_indices_)) elif self._estimator.__class__.__name__ == "SMOTETomek": - samples = sorted(self._estimator.tomek_.sample_indices_) + samples = np.asarray(sorted(self._estimator.tomek_.sample_indices_)) # Select the remaining samples from the old dataframe - o_samples = [s for s in samples if s < len(X)] - X, y = X.iloc[o_samples], y.iloc[o_samples] # type: ignore[call-overload] + o_samples = [s for s in samples if s < len(Xt)] + Xt, yt = Xt.iloc[o_samples], yt.iloc[o_samples] # type: ignore[call-overload] # Create indices for the new samples if index.dtype.kind in "ifu": - n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(X) + 1)) + n_idx = list(range(max(index) + 1, max(index) 
+ len(X_new) - len(Xt) + 1)) else: n_idx = [ f"{self._estimator.__class__.__name__.lower()}_{i}" - for i in range(1, len(X_new) - len(X) + 1) + for i in range(1, len(X_new) - len(Xt) + 1) ] # Select the new samples and assign the new indices X_new = X_new.iloc[-len(X_new) + len(o_samples):] - X_new.index = n_idx + X_new.index = pd.Index(n_idx) y_new = y_new.iloc[-len(y_new) + len(o_samples):] - y_new.index = n_idx + y_new.index = pd.Index(n_idx) # First, output the samples created for key, value in self.mapping_.items(): @@ -559,17 +617,17 @@ def log_changes(y): # Then, output the samples dropped for key, value in self.mapping_.items(): - if (diff := self._counts[key] - np.sum(y == value)) > 0: + if (diff := self._counts[key] - np.sum(yt == value)) > 0: self._log(f" --> Removing {diff} samples from class: {key}.", 2) # Add the new samples to the old dataframe - X, y = bk.concat([X, X_new]), bk.concat([y, y_new]) + Xt, yt = pd.concat([Xt, X_new]), pd.concat([yt, y_new]) - return X, y + return self._convert(Xt), self._convert(yt) @beartype -class Cleaner(TransformerMixin, _SetOutputMixin): +class Cleaner(TransformerMixin): """Applies standard data cleaning steps on a dataset. Use the parameters to choose which transformations to perform. @@ -623,24 +681,12 @@ class Cleaner(TransformerMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -733,27 +779,17 @@ def __init__( self.drop_missing_target = drop_missing_target self.encode_target = encode_target - @composed(crash, method_to_log) - def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> Self: """Fit to data. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -761,7 +797,14 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: Estimator instance. 
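With the Balancer changes above, y is now a required argument and the per-class log goes through _log_changes. A usage sketch (the "randomoversampler" strategy name is an assumption not shown in this hunk; any registered imbalanced-learn sampler key works the same way):

import pandas as pd
from atom.data_cleaning import Balancer

X = pd.DataFrame({"x1": range(25), "x2": range(25, 50)})
y = pd.Series([0] * 20 + [1] * 5, name="target")

balancer = Balancer(strategy="randomoversampler")
X_bal, y_bal = balancer.fit_transform(X, y)  # minority class oversampled to match the majority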
""" + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self.mapping_: dict[str, Any] = {} + self.target_names_in_ = np.array([]) self._drop_cols = [] self._estimators = {} @@ -770,26 +813,23 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: self._log("Fitting Cleaner...", 1) - if X is not None and self.drop_dtypes is not None: - self._drop_cols = list(X.select_dtypes(include=lst(self.drop_dtypes)).columns) + if Xt is not None and self.drop_dtypes is not None: + self._drop_cols = list(Xt.select_dtypes(include=lst(self.drop_dtypes)).columns) - if y is not None: - if isinstance(y, series_t): - self.target_names_in_ = np.array([y.name]) - else: - self.target_names_in_ = y.columns.to_numpy() + if yt is not None: + self.target_names_in_ = np.array(get_col_names(yt)) if self.drop_chars: - if isinstance(y, series_t): - y.name = re.sub(self.drop_chars, "", str(y.name)) + if isinstance(yt, pd.DataFrame): + yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) else: - y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + yt.name = re.sub(self.drop_chars, "", str(yt.name)) if self.drop_missing_target: - y = replace_missing(y, self.missing_).dropna(axis=0) + yt = replace_missing(yt, self.missing_).dropna(axis=0) if self.encode_target: - for col in get_cols(y): + for col in get_cols(yt): if isinstance(col.iloc[0], sequence_t): # Multilabel MultiLabelBinarizer = self._get_est_class( name="MultiLabelBinarizer", @@ -799,7 +839,9 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: elif list(uq := np.unique(col)) != list(range(col.nunique())): LabelEncoder = self._get_est_class("LabelEncoder", "preprocessing") self._estimators[col.name] = LabelEncoder().fit(col) - self.mapping_.update({col.name: {str(it(v)): i for i, v in enumerate(uq)}}) + self.mapping_.update( + {str(col.name): {str(it(v)): i for i, v in enumerate(uq)}} + ) return self @@ -829,31 +871,21 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return np.array(columns) - @composed(crash, method_to_log) def transform( self, - X: DataFrame | None = None, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + X: XConstructor | None = None, + y: YConstructor | None = None, + ) -> YReturn | tuple[XReturn, YReturn]: """Apply the data cleaning steps to the data. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -864,95 +896,98 @@ def transform( Transformed target column. Only returned if provided. 
""" + check_is_fitted(self) + + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None), columns=self.target_names_in_) + self._log("Cleaning the data...", 1) - if X is not None: + if Xt is not None: # Unify all missing values - X = replace_missing(X, self.missing_) + Xt = replace_missing(Xt, self.missing_) - for name, column in X.items(): + for name, column in Xt.items(): # Drop features with an invalid data type if name in self._drop_cols: self._log( f" --> Dropping feature {name} for " f"having type: {column.dtype.name}.", 2, ) - X = X.drop(columns=name) + Xt = Xt.drop(columns=name) elif column.dtype.name in CAT_TYPES: if self.strip_categorical: # Strip strings from blank spaces - X[name] = column.apply( + Xt[name] = column.apply( lambda val: val.strip() if isinstance(val, str) else val ) # Drop prohibited chars from column names if self.drop_chars: - X = X.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x))) + Xt = Xt.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x))) # Drop duplicate samples if self.drop_duplicates: - X = X.drop_duplicates(ignore_index=True) + Xt = Xt.drop_duplicates(ignore_index=True) if self.convert_dtypes: - X = X.convert_dtypes() + Xt = Xt.convert_dtypes() - if y is not None: + if yt is not None: if self.drop_chars: - if isinstance(y, series_t): - y.name = re.sub(self.drop_chars, "", str(y.name)) + if isinstance(y, pd.Series): + yt.name = re.sub(self.drop_chars, "", str(yt.name)) else: - y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) # Delete samples with missing values in target if self.drop_missing_target: - length = len(y) # Save original length to count deleted rows later - y = replace_missing(y, self.missing_).dropna() + length = len(yt) # Save original length to count deleted rows later + yt = replace_missing(yt, self.missing_).dropna() - if X is not None: - X = X[X.index.isin(y.index)] # Select only indices that remain + if Xt is not None: + Xt = Xt[Xt.index.isin(yt.index)] # Select only indices that remain - if (d := length - len(y)) > 0: + if (d := length - len(yt)) > 0: self._log(f" --> Dropping {d} rows with missing values in target.", 2) if self.encode_target and self._estimators: - yt = y.__class__(dtype="object") - for col in get_cols(y): + y_new = yt.__class__(dtype="object") + for col in get_cols(yt): if est := self._estimators.get(col.name): if n_cols(out := est.transform(col)) == 1: self._log(f" --> Label-encoding column {col.name}.", 2) - out = to_series(out, y.index, col.name) - + out = to_series(out, yt.index, str(col.name)) else: self._log(f" --> Label-binarizing column {col.name}.", 2) out = to_df( data=out, - index=y.index, + index=yt.index, columns=[f"{col.name}_{c}" for c in est.classes_], ) # Replace target with encoded column(s) - if isinstance(y, series_t): - yt = out + if isinstance(yt, pd.Series): + y_new = out else: - yt = merge(yt, out) + y_new = merge(y_new, out) else: # Add unchanged column - yt = merge(yt, col) + y_new = merge(y_new, col) - y = yt + yt = y_new if self.convert_dtypes: - y = y.convert_dtypes() + yt = yt.convert_dtypes() - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) - @composed(crash, method_to_log) def inverse_transform( self, - X: DataFrame | None = None, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + X: XConstructor | None = None, + y: YConstructor | None = None, + ) -> 
YReturn | tuple[XReturn, YReturn]: """Inversely transform the label encoding. This method only inversely transforms the target encoding. @@ -964,17 +999,8 @@ def inverse_transform( X: dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. - y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -985,38 +1011,43 @@ def inverse_transform( Original target column. Only returned if provided. """ + check_is_fitted(self) + + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + self._log("Inversely cleaning the data...", 1) - if y is not None and self._estimators: - yt = y.__class__(dtype="object") + if yt is not None and self._estimators: + y_new = yt.__class__(dtype="object") for col in self.target_names_in_: if est := self._estimators.get(col): if est.__class__.__name__ == "LabelEncoder": self._log(f" --> Inversely label-encoding column {col}.", 2) - out = est.inverse_transform(bk.DataFrame(y)[col]) + out = est.inverse_transform(pd.DataFrame(yt)[col]) - elif isinstance(y, dataframe_t): + elif isinstance(yt, pd.DataFrame): self._log(f" --> Inversely label-binarizing column {col}.", 2) out = est.inverse_transform( - y.loc[:, y.columns.str.startswith(f"{col}_")].to_numpy() + yt.loc[:, yt.columns.str.startswith(f"{col}_")].to_numpy() ) # Replace encoded columns with target column - if isinstance(y, series_t): - yt = to_series(out, y.index, col) + if isinstance(yt, pd.Series): + y_new = to_series(out, yt.index, col) else: - yt = merge(yt, to_series(out, y.index, col)) + y_new = merge(y_new, to_series(out, yt.index, col)) else: # Add unchanged column - yt = merge(yt, bk.DataFrame(y)[col]) + y_new = merge(y_new, pd.DataFrame(yt)[col]) - y = yt + yt = y_new - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @beartype -class Decomposer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Decomposer(TransformerMixin, OneToOneFeatureMixin): """Detrend and deseasonalize the time series. This class does two things: @@ -1155,8 +1186,7 @@ def __init__( self.sp = sp self.seasonal_model = seasonal_model - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -1164,7 +1194,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
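The reworked Cleaner above keeps its label encoders in _estimators and the class mapping in mapping_, so an encoded target can later be restored. Sketch (illustrative values):

import pandas as pd
from atom.data_cleaning import Cleaner

X = pd.DataFrame({"feature": [1, 2, 3, 4]})
y = pd.Series(["cat", "dog", "dog", "cat"], name="animal")

cleaner = Cleaner(encode_target=True)
X_clean, y_enc = cleaner.fit_transform(X, y)  # y_enc holds 0/1 labels
print(cleaner.mapping_)                       # e.g. {"animal": {"cat": 0, "dog": 1}}
y_orig = cleaner.inverse_transform(y=y_enc)   # back to "cat"/"dog"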
Returns @@ -1175,13 +1205,17 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """ from atom.models import MODELS + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if isinstance(self.model, str): if self.model in MODELS: model = MODELS[self.model]( goal=Goal.forecast, **{x: getattr(self, x) for x in BaseTransformer.attrs if hasattr(self, x)}, ) - model.task = Goal.forecast.infer_task(y) forecaster = model._get_est({}) else: raise ValueError( @@ -1203,7 +1237,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._log("Fitting Decomposer...", 1) self._estimators: dict[Hashable, tuple[Transformer, Transformer]] = {} - for name, column in X.select_dtypes(include="number").items(): + for name, column in Xt.select_dtypes(include="number").items(): trend = Detrender( forecaster=forecaster, model=self.trend_model, @@ -1224,8 +1258,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Decompose the data. Parameters @@ -1233,7 +1266,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1242,15 +1275,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Decomposing the data...", 1) for col, (trend, season) in self._estimators.items(): - X[col] = season.transform(trend.transform(X[col])) + Xt[col] = season.transform(trend.transform(Xt[col])) - return X + return self._convert(Xt) - @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Inversely transform the data. Parameters @@ -1258,7 +1294,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1267,16 +1303,20 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Original feature set. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Inversely decomposing the data...", 1) for col, (trend, season) in self._estimators.items(): - X[col] = trend.inverse_transform(season.inverse_transform(X[col])) + Xt[col] = trend.inverse_transform(season.inverse_transform(Xt[col])) - return X + return self._convert(Xt) @beartype -class Discretizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Discretizer(TransformerMixin, OneToOneFeatureMixin): """Bin continuous data into intervals. For each feature, the bin edges are computed during fit and, @@ -1334,24 +1374,12 @@ class Discretizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. 
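The Decomposer above fits a Detrender/Deseasonalizer pair per numeric column during fit and applies or reverts them in transform and inverse_transform. A sketch with the default model, assuming sktime is installed and the index carries a frequency (monthly PeriodIndex here is an assumption for illustration):

import numpy as np
import pandas as pd
from atom.data_cleaning import Decomposer

idx = pd.period_range("2020-01", periods=36, freq="M")
X = pd.DataFrame({"sales": np.arange(36) + 10 * np.sin(np.arange(36) / 6)}, index=idx)

decomposer = Decomposer(sp=12)
X_stationary = decomposer.fit_transform(X)               # trend and seasonality removed
X_restored = decomposer.inverse_transform(X_stationary)  # approximately the original series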
- engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -1444,8 +1472,7 @@ def __init__( self.bins = bins self.labels = labels - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -1453,7 +1480,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1501,16 +1528,21 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: return labels - self._estimators: dict[str, Estimator] = {} - self._labels: dict[str, Sequence[str]] = {} + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + + self._estimators: dict[Hashable, Estimator] = {} + self._labels: dict[Hashable, Sequence[str]] = {} self._log("Fitting Discretizer...", 1) - for i, col in enumerate(X.select_dtypes(include="number")): + for i, col in enumerate(Xt.select_dtypes(include="number")): # Assign bins per column if isinstance(self.bins, dict): if col in self.bins: - bins_c = self.bins[col] + bins_c = self.bins[str(col)] else: continue # Ignore existing column not specified in dict else: @@ -1524,7 +1556,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: raise ValueError( "Invalid value for the bins parameter. The length of the " "bins does not match the length of the columns, got len" - f"(bins)={len(bins_c)} and len(columns)={X.shape[1]}." + f"(bins)={len(bins_c)} and len(columns)={Xt.shape[1]}." ) from None else: bins_x = bins_c @@ -1542,11 +1574,11 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: encode="ordinal", strategy=self.strategy, **kwargs, - ).fit(X[[col]]) + ).fit(Xt[[col]]) # Save labels for transform method self._labels[col] = get_labels( - col=col, + col=str(col), bins=self._estimators[col].bin_edges_[0], ) @@ -1566,14 +1598,13 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: # Make of cut a transformer self._estimators[col] = FunctionTransformer( - func=bk.cut, - kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)}, - ).fit(X[[col]]) + func=pd.cut, + kw_args={"bins": bins_c, "labels": get_labels(str(col), bins_c)}, + ).fit(Xt[[col]]) return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Bin the data into intervals. 
Parameters @@ -1581,7 +1612,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1590,25 +1621,29 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Binning the features...", 1) for col in self._estimators: if self.strategy == "custom": - X[col] = self._estimators[col].transform(X[col]) + Xt[col] = self._estimators[col].transform(Xt[col]) else: - X[col] = self._estimators[col].transform(X[[col]]).iloc[:, 0] + Xt[col] = self._estimators[col].transform(Xt[[col]]).iloc[:, 0] # Replace cluster values with labels for i, label in enumerate(self._labels[col]): - X[col] = X[col].replace(i, label) + Xt[col] = Xt[col].replace(i, label) - self._log(f" --> Discretizing feature {col} in {X[col].nunique()} bins.", 2) + self._log(f" --> Discretizing feature {col} in {Xt[col].nunique()} bins.", 2) - return X + return self._convert(Xt) @beartype -class Encoder(TransformerMixin, _SetOutputMixin): +class Encoder(TransformerMixin): """Perform encoding of categorical features. The encoding type depends on the number of classes in the column: @@ -1761,8 +1796,7 @@ def __init__( self.value = value self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Note that leaving y=None can lead to errors if the `strategy` @@ -1774,17 +1808,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence or dataframe-like - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: sequence or dataframe-like + Target column(s) corresponding to `X`. 
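For the custom strategy, the Discretizer above wraps pd.cut in a FunctionTransformer using the bin edges and labels supplied per column. Sketch (bins and labels passed as per-column dicts, which the bins/labels parameters are assumed to accept):

import pandas as pd
from atom.data_cleaning import Discretizer

X = pd.DataFrame({"age": [12, 25, 37, 52, 68, 80]})

disc = Discretizer(
    strategy="custom",
    bins={"age": [0, 18, 40, 65, 100]},  # n_labels + 1 edges
    labels={"age": ["child", "adult", "middle-aged", "senior"]},
)
X_binned = disc.fit_transform(X)  # "age" becomes a categorical column with the given labels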
Returns ------- @@ -1811,6 +1836,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "woe": WOEEncoder, } + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if isinstance(self.strategy, str): if self.strategy.lower().endswith("encoder"): self.strategy = self.strategy[:-7] # Remove 'Encoder' at the end @@ -1835,7 +1866,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: if self.infrequent_to_value: if self.infrequent_to_value < 1: - infrequent_to_value = int(self.infrequent_to_value * len(X)) + infrequent_to_value = int(self.infrequent_to_value * len(Xt)) else: infrequent_to_value = int(self.infrequent_to_value) @@ -1843,12 +1874,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: encoders: dict[str, list[str]] = defaultdict(list) - for name, column in X.select_dtypes(include=CAT_TYPES).items(): + for name, column in Xt.select_dtypes(include=CAT_TYPES).items(): # Replace infrequent classes with the string in `value` if self.infrequent_to_value: values = column.value_counts() self._to_value[name] = values[values <= infrequent_to_value].index.tolist() - X[name] = column.replace(self._to_value[name], self.value) + Xt[name] = column.replace(self._to_value[name], self.value) # Get the unique categories before fitting self._categories[name] = column.dropna().sort_values().unique().tolist() @@ -1862,8 +1893,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._log( f" --> The number of classes passed to feature {name} in the " f"ordinal parameter ({len(ordinal_c)}) don't match the number " - f"of classes in the data ({column.nunique(dropna=True)}).", - 1, + f"of classes in the data ({column.nunique(dropna=True)}).", 1, severity="warning", ) @@ -1908,7 +1938,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: remainder="passthrough", n_jobs=self.n_jobs, verbose_feature_names_out=False, - ).fit(X, y) + ).fit(Xt, yt) return self @@ -1935,8 +1965,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return get_col_order(cols, self.feature_names_in_, self._estimator.feature_names_in_) - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Encode the data. Parameters @@ -1944,7 +1973,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1953,10 +1982,14 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Encoded dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Encoding categorical columns...", 1) # Convert infrequent classes to value - X = X.replace(self._to_value, self.value) + Xt = Xt.replace(self._to_value, self.value) for name, categories in self._categories.items(): if name in self._estimator.transformers_[0][2]: @@ -1968,24 +2001,24 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log( f" --> {estimator.__class__.__name__[:-7]}-encoding feature " - f"{name}. Contains {X[name].nunique()} classes.", 2, + f"{name}. 
Contains {Xt[name].nunique()} classes.", 2, ) # Count the propagated missing values - if n_nans := X[name].isna().sum(): + if n_nans := Xt[name].isna().sum(): self._log(f" --> Propagating {n_nans} missing values.", 2) # Check for unknown classes - if uc := len(X[name].dropna()[~X[name].isin(categories)]): + if uc := len(Xt[name].dropna()[~Xt[name].isin(categories)]): self._log(f" --> Handling {uc} unknown classes.", 2) - Xt = self._estimator.transform(X) + Xt = self._estimator.transform(Xt) - return Xt[self.get_feature_names_out()] + return self._convert(Xt[self.get_feature_names_out()]) @beartype -class Imputer(TransformerMixin, _SetOutputMixin): +class Imputer(TransformerMixin): """Handle missing values in the data. Impute or remove missing values according to the selected strategy. @@ -1999,7 +2032,7 @@ class Imputer(TransformerMixin, _SetOutputMixin): Parameters ---------- - strat_num: str, int or float, default="drop" + strat_num: str, int or float, default="mean" Imputing strategy for numerical columns. Choose from: - "drop": Drop rows containing missing values. @@ -2019,7 +2052,7 @@ class Imputer(TransformerMixin, _SetOutputMixin): of column. - int or float: Impute with provided numerical value. - strat_cat: str, default="drop" + strat_cat: str, default="most_frequent" Imputing strategy for categorical columns. Choose from: - "drop": Drop rows containing missing values. @@ -2047,24 +2080,12 @@ class Imputer(TransformerMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -2145,8 +2166,8 @@ class Imputer(TransformerMixin, _SetOutputMixin): def __init__( self, - strat_num: Scalar | NumericalStrats = "drop", - strat_cat: str | CategoricalStrats = "drop", + strat_num: Scalar | NumericalStrats = "mean", + strat_cat: str | CategoricalStrats = "most_frequent", *, max_nan_rows: FloatLargerZero | None = None, max_nan_cols: FloatLargerZero | None = None, @@ -2168,8 +2189,7 @@ def __init__( self.max_nan_rows = max_nan_rows self.max_nan_cols = max_nan_cols - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -2177,7 +2197,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -2186,22 +2206,27 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Estimator instance. 
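The Encoder above fits a single ColumnTransformer: low-cardinality columns (up to max_onehot classes) are one-hot encoded, higher-cardinality columns use the chosen category-encoders strategy. Sketch (assumes category_encoders is installed; max_onehot=4 is an illustrative choice):

import pandas as pd
from atom.data_cleaning import Encoder

X = pd.DataFrame({
    "size": ["S", "M", "L", "M", "S", "L"],  # 3 classes -> one-hot encoded
    "city": ["A", "B", "C", "D", "E", "F"],  # 6 classes -> strategy encoding
})
y = pd.Series([0, 1, 1, 0, 1, 0], name="target")

encoder = Encoder(strategy="Target", max_onehot=4)
X_enc = encoder.fit_transform(X, y)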
""" + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if not hasattr(self, "missing_"): self.missing_ = DEFAULT_MISSING self._log("Fitting Imputer...", 1) # Unify all values to impute - X = replace_missing(X, self.missing_) + Xt = replace_missing(Xt, self.missing_) if self.max_nan_rows is not None: if self.max_nan_rows <= 1: - self._max_nan_rows = int(X.shape[1] * self.max_nan_rows) + self._max_nan_rows = int(Xt.shape[1] * self.max_nan_rows) else: self._max_nan_rows = int(self.max_nan_rows) - X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows) - if X.empty: + Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows) + if Xt.empty: raise ValueError( "Invalid value for the max_nan_rows parameter, got " f"{self.max_nan_rows}. All rows contain more than " @@ -2211,11 +2236,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: if self.max_nan_cols is not None: if self.max_nan_cols <= 1: - max_nan_cols = int(X.shape[0] * self.max_nan_cols) + max_nan_cols = int(Xt.shape[0] * self.max_nan_cols) else: max_nan_cols = int(self.max_nan_cols) - X = X.drop(columns=X.columns[X.isna().sum() > max_nan_cols]) + Xt = Xt.drop(columns=Xt.columns[Xt.isna().sum() > max_nan_cols]) # Load the imputer class from sklearn or cuml (note the different modules) SimpleImputer = self._get_est_class( @@ -2235,7 +2260,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: elif self.strat_num == "drop": num_imputer = "passthrough" else: - num_imputer = make_sklearn(sktimeImputer)( + num_imputer = make_sklearn(SktimeImputer)( method=self.strat_num, missing_values=[pd.NA], random_state=self.random_state, @@ -2263,13 +2288,13 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._estimator = ColumnTransformer( transformers=[ - ("num_imputer", num_imputer, list(X.select_dtypes(include="number"))), - ("cat_imputer", cat_imputer, list(X.select_dtypes(include=CAT_TYPES))), + ("num_imputer", num_imputer, list(Xt.select_dtypes(include="number"))), + ("cat_imputer", cat_imputer, list(Xt.select_dtypes(include=CAT_TYPES))), ], remainder="passthrough", n_jobs=self.n_jobs, verbose_feature_names_out=False, - ).fit(X) + ).fit(Xt) return self @@ -2295,12 +2320,11 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> [c for c in self.feature_names_in_ if c in self._estimator.get_feature_names_out()] ) - @composed(crash, method_to_log) def transform( self, - X: DataFrame, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + X: XConstructor, + y: YConstructor | None = None, + ) -> YReturn | tuple[XReturn, YReturn]: """Impute the missing values. Note that leaving y=None can lead to inconsistencies in @@ -2312,17 +2336,8 @@ def transform( X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -2333,6 +2348,11 @@ def transform( Transformed target column. Only returned if provided. 
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + yt = to_tabular(y, index=Xt.index) + num_imputer = self._estimator.named_transformers_["num_imputer"] cat_imputer = self._estimator.named_transformers_["cat_imputer"] @@ -2341,52 +2361,49 @@ def transform( self._log("Imputing missing values...", 1) # Unify all values to impute - X = replace_missing(X, self.missing_) + Xt = replace_missing(Xt, self.missing_) # Drop rows with too many missing values if self.max_nan_rows is not None: - length = len(X) - X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows) - if diff := length - len(X): + length = len(Xt) + Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows) + if diff := length - len(Xt): self._log( f" --> Dropping {diff} samples for containing more " - f"than {self._max_nan_rows} missing values.", - 2, + f"than {self._max_nan_rows} missing values.", 2, ) if self.strat_num == "drop": - length = len(X) - X = X.dropna(subset=self._estimator.transformers_[0][2]) - if diff := length - len(X): + length = len(Xt) + Xt = Xt.dropna(subset=self._estimator.transformers_[0][2]) + if diff := length - len(Xt): self._log( f" --> Dropping {diff} samples for containing " - f"missing values in numerical columns.", - 2, + f"missing values in numerical columns.", 2, ) if self.strat_cat == "drop": - length = len(X) - X = X.dropna(subset=self._estimator.transformers_[1][2]) - if diff := length - len(X): + length = len(Xt) + Xt = Xt.dropna(subset=self._estimator.transformers_[1][2]) + if diff := length - len(Xt): self._log( f" --> Dropping {diff} samples for containing " - f"missing values in categorical columns.", - 2, + f"missing values in categorical columns.", 2, ) # Print imputation information per feature - for name, column in X.items(): + for name, column in Xt.items(): if nans := column.isna().sum(): # Drop columns with too many missing values if name not in self._estimator.feature_names_in_: self._log( f" --> Dropping feature {name}. 
Contains {nans} " - f"({nans * 100 // len(X)}%) missing values.", 2, + f"({nans * 100 // len(Xt)}%) missing values.", 2, ) - X = X.drop(columns=name) + Xt = Xt.drop(columns=name) continue - if self.strat_num != "drop" and name in num_imputer.feature_names_in_: + if name in getattr(num_imputer, "feature_names_in_", []): if not isinstance(self.strat_num, str): self._log( f" --> Imputing {nans} missing values with " @@ -2400,15 +2417,14 @@ def transform( elif self.strat_num in ("mean", "median", "most_frequent"): self._log( f" --> Imputing {nans} missing values with {self.strat_num} " - f"({np.round(get_stat(num_imputer, name), 2)}) in column " - f"{name}.", 2, + f"({np.round(get_stat(num_imputer, name), 2)}) in column {name}.", 2, ) else: self._log( f" --> Imputing {nans} missing values with {self.strat_num} " f"in column {name}.", 2, ) - elif self.strat_cat != "drop" and name in cat_imputer.feature_names_in_: + elif name in getattr(cat_imputer, "feature_names_in_", []): if self.strat_cat == "most_frequent": self._log( f" --> Imputing {nans} missing values with most_frequent " @@ -2420,20 +2436,20 @@ def transform( f"'{self.strat_cat}' in column {name}.", 2, ) - Xt = self._estimator.transform(X) + Xt = self._estimator.transform(Xt) # Make y consistent with X - if y is not None: - y = y[y.index.isin(Xt.index)] + if yt is not None: + yt = yt[yt.index.isin(Xt.index)] # Reorder columns to original order Xt = Xt[self.get_feature_names_out()] - return variable_return(Xt, y) + return variable_return(self._convert(Xt), self._convert(yt)) @beartype -class Normalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Normalizer(TransformerMixin, OneToOneFeatureMixin): """Transform the data to follow a Normal/Gaussian distribution. This transformation is useful for modeling issues related to @@ -2470,24 +2486,12 @@ class Normalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -2575,8 +2579,7 @@ def __init__( self.strategy = strategy self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -2584,7 +2587,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
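Note the changed Imputer defaults above: strat_num now defaults to "mean" and strat_cat to "most_frequent" instead of "drop". A sketch of the new default behavior:

import numpy as np
import pandas as pd
from atom.data_cleaning import Imputer

X = pd.DataFrame({
    "num": [1.0, np.nan, 3.0, 4.0],
    "cat": ["a", "b", None, "b"],
})

imputer = Imputer()               # mean / most_frequent by default now
X_imp = imputer.fit_transform(X)  # NaN in "num" -> ~2.67 (mean), None in "cat" -> "b"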
Returns @@ -2599,6 +2602,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "quantile": "QuantileTransformer", } + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if self.strategy in ("yeojohnson", "boxcox"): estimator = self._get_est_class(strategies[self.strategy], "preprocessing") self._estimator = estimator( @@ -2619,7 +2627,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: f"Choose from: {', '.join(strategies)}." ) - num_cols = X.select_dtypes(include="number") + num_cols = Xt.select_dtypes(include="number") if num_cols.empty: raise ValueError( @@ -2635,8 +2643,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the transformations to the data. Parameters @@ -2644,7 +2651,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -2653,15 +2660,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Normalized dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Normalizing features...", 1) - Xt = self._estimator.transform(X[self._estimator.feature_names_in_]) - X.update(Xt) + Xt.update(self._estimator.transform(Xt[self._estimator.feature_names_in_])) - return X[self.feature_names_in_] + return self._convert(Xt) - @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the inverse transformation to the data. Parameters @@ -2669,7 +2678,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -2678,17 +2687,21 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Original dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Inversely normalizing features...", 1) - Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_]) - Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_) - X.update(Xt) + out: np.ndarray = self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]) + + Xt.update(to_df(out, index=Xt.index, columns=self._estimator.feature_names_in_)) - return X + return self._convert(Xt) @beartype -class Pruner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Pruner(TransformerMixin, OneToOneFeatureMixin): """Prune outliers from the data. Replace or remove outliers. The definition of outlier depends @@ -2743,25 +2756,12 @@ class Pruner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. 
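The Normalizer above maps numeric columns toward a Gaussian shape; "yeojohnson" is the default strategy and inverse_transform reverts the transformation. Sketch:

import numpy as np
import pandas as pd
from atom.data_cleaning import Normalizer

rng = np.random.default_rng(1)
X = pd.DataFrame({"skewed": rng.exponential(scale=2.0, size=200)})

normalizer = Normalizer(strategy="yeojohnson")
X_norm = normalizer.fit_transform(X)
X_back = normalizer.inverse_transform(X_norm)  # approximately the original values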
- engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "estimator": - - - "sklearn" (default) - - "sklearnex" - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -2850,12 +2850,11 @@ def __init__( self.include_target = include_target self.kwargs = kwargs - @composed(crash, method_to_log) def transform( self, - X: DataFrame, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + X: XConstructor, + y: YConstructor | None = None, + ) -> YReturn | tuple[XReturn, YReturn]: """Apply the outlier strategy on the data. Parameters @@ -2863,17 +2862,8 @@ def transform( X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -2884,6 +2874,9 @@ def transform( Transformed target column. Only returned if provided. 
""" + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + yt = to_tabular(y, index=Xt.index) + # Estimators with their modules strategies = { "iforest": ["IsolationForest", "ensemble"], @@ -2922,7 +2915,7 @@ def transform( self._log("Pruning outliers...", 1) # Prepare dataset (merge with y and exclude categorical columns) - objective = merge(X, y) if self.include_target and y is not None else X + objective = merge(Xt, yt) if self.include_target and yt is not None else Xt objective = objective.select_dtypes(include=["number"]) outliers = [] @@ -2984,27 +2977,27 @@ def transform( if outliers: # Select outliers from intersection of strategies - mask = [any(strats) for strats in zip(*outliers, strict=True)] - self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2) + outlier_rows = [any(strats) for strats in zip(*outliers, strict=True)] + self._log(f" --> Dropping {len(outlier_rows) - sum(outlier_rows)} outliers.", 2) # Keep only the non-outliers from the data - X = X[mask] - if y is not None: - y = y[mask] + Xt = Xt[outlier_rows] + if yt is not None: + yt = yt[outlier_rows] else: # Replace the columns in X and y with the new values from objective - X.update(objective) - if isinstance(y, series_t) and y.name in objective: - y.update(objective[str(y.name)]) - elif isinstance(y, dataframe_t): - y.update(objective) + Xt.update(objective) + if isinstance(yt, pd.Series) and yt.name in objective: + yt.update(objective[str(yt.name)]) + elif isinstance(yt, pd.DataFrame): + yt.update(objective) - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @beartype -class Scaler(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Scaler(TransformerMixin, OneToOneFeatureMixin): """Scale the data. Apply one of sklearn's scaling strategies. Categorical columns @@ -3033,24 +3026,12 @@ class Scaler(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -3128,8 +3109,7 @@ def __init__( self.include_binary = include_binary self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -3137,7 +3117,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
Returns @@ -3153,10 +3133,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "robust": "RobustScaler", } - num_cols = X.select_dtypes(include="number") + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + + num_cols = Xt.select_dtypes(include="number") if not self.include_binary: - num_cols = X[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]] + num_cols = Xt[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]] if num_cols.empty: raise ValueError( @@ -3165,19 +3150,17 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "non-binary columns when include_binary=False." ) - estimator = self._get_est_class(strategies[self.strategy], "preprocessing") - self._estimator = estimator(**self.kwargs) - self._log("Fitting Scaler...", 1) - self._estimator.fit(num_cols) + + estimator = self._get_est_class(strategies[self.strategy], "preprocessing") + self._estimator = estimator(**self.kwargs).fit(num_cols) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Perform standardization by centering and scaling. Parameters @@ -3185,7 +3168,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -3194,15 +3177,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Scaled dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Scaling features...", 1) - Xt = self._estimator.transform(X[self._estimator.feature_names_in_]) - X.update(Xt) + Xt.update(self._estimator.transform(Xt[self._estimator.feature_names_in_])) - return X + return self._convert(Xt) - @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the inverse transformation to the data. Parameters @@ -3210,7 +3195,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -3219,10 +3204,14 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Scaled dataframe. 
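A short standalone example of the binary-column filter in Scaler.fit above (data is made up): a numeric column whose unique values are a subset of {0, 1} is skipped when include_binary=False.

import numpy as np
import pandas as pd

X = pd.DataFrame({"age": [21, 35, 52], "flag": [0, 1, 0], "income": [30.0, 45.0, 80.0]})

num_cols = X.select_dtypes(include="number")
to_scale = X[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]]
print(list(to_scale.columns))  # ['age', 'income'] -> 'flag' is left untouched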
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Inversely scaling features...", 1) - Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_]) - Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_) - X.update(Xt) + out: np.ndarray = self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]) + + Xt.update(to_df(out, index=Xt.index, columns=self._estimator.feature_names_in_)) - return X + return self._convert(Xt) diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 16e745f84..430d7fe71 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -9,10 +9,9 @@ from collections.abc import Hashable from random import sample -from typing import Any, Literal +from typing import Any, Literal, cast import featuretools as ft -import joblib import numpy as np import pandas as pd from beartype import beartype @@ -34,14 +33,14 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( - Backend, Bool, DataFrame, Engine, FeatureSelectionSolvers, - FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, - FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, - Pandas, Scalar, Sequence, Series, Verbose, series_t, + Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, + FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, + IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, + Verbose, XConstructor, XReturn, YConstructor, ) from atom.utils.utils import ( - Goal, Task, bk, check_is_fitted, check_scaling, composed, crash, - get_custom_scorer, is_sparse, lst, merge, method_to_log, sign, + Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse, + lst, merge, sign, to_df, to_tabular, ) @@ -173,8 +172,7 @@ def __init__( self.drop_columns = drop_columns self.from_index = from_index - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Extract the new features. Parameters @@ -182,7 +180,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -191,24 +189,26 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
""" + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + self._log("Extracting datetime features...", 1) if self.from_index: - if hasattr(X.index, "to_timestamp"): - Xc = bk.DataFrame(X.index.to_timestamp()) - order = Xc.columns.tolist() + X.columns.tolist() + if hasattr(Xt.index, "to_timestamp"): + Xc = pd.DataFrame(Xt.index.to_timestamp()) + order = Xc.columns.tolist() + Xt.columns.tolist() else: raise ValueError("Unable to convert the index to a timestamp format.") else: - Xc = X.select_dtypes(exclude="number") - order = X.columns.tolist() + Xc = Xt.select_dtypes(exclude="number") + order = Xt.columns.tolist() - Xt = bk.DataFrame(index=X.index) + X_new = pd.DataFrame(index=Xt.index) for name, column in Xc.items(): col_dt = pd.to_datetime( arg=column, errors="coerce", # Converts to NaT if he can't format - format=self.fmt.get(name) if isinstance(self.fmt, dict) else self.fmt, + format=self.fmt.get(str(name)) if isinstance(self.fmt, dict) else self.fmt, ) # If >30% values are NaT, the conversion was unsuccessful @@ -228,7 +228,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f"{fx.lower()} is not an attribute of pd.Series.dt." ) - if not isinstance(series, series_t): + if not isinstance(series, pd.Series): self._log( f" --> Extracting feature {fx} " "failed. Result is not a Series.dt.", 2, @@ -238,7 +238,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: continue # Skip if the resulting feature has zero variance min_val: int = 0 - max_val: Scalar | Series | None = None # None if isn't cyclic + max_val: Scalar | pd.Series | None = None # None if isn't cyclic if self.encoding_type == "cyclic": if fx == "microsecond": min_val, max_val = 0, 1e6 - 1 @@ -252,7 +252,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: min_val, max_val = 1, col_dt.dt.daysinmonth elif fx in ("dayofyear", "day_of_year"): min_val = 1 - max_val = [365 if i else 366 for i in col_dt.dt.is_leap_year] + max_val = pd.Series([365 if i else 366 for i in col_dt.dt.is_leap_year]) elif fx == "month": min_val, max_val = 1, 12 elif fx == "quarter": @@ -261,21 +261,21 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: new_name = f"{name}_{fx}" if self.encoding_type == "ordinal" or max_val is None: self._log(f" --> Creating feature {new_name}.", 2) - Xt[new_name] = series.to_numpy() - order.insert(order.index(name) + 1, new_name) + X_new[new_name] = series.to_numpy() + order.insert(order.index(str(name)) + 1, new_name) elif self.encoding_type == "cyclic": self._log(f" --> Creating cyclic feature {new_name}.", 2) pos = 2 * np.pi * (series.to_numpy() - min_val) / np.array(max_val) - Xt[f"{new_name}_sin"] = np.sin(pos) - Xt[f"{new_name}_cos"] = np.cos(pos) - order.insert(order.index(name) + 1, f"{new_name}_sin") - order.insert(order.index(name) + 2, f"{new_name}_cos") + X_new[f"{new_name}_sin"] = np.sin(pos) + X_new[f"{new_name}_cos"] = np.cos(pos) + order.insert(order.index(str(name)) + 1, f"{new_name}_sin") + order.insert(order.index(str(name)) + 2, f"{new_name}_cos") # Drop the original column if self.drop_columns or self.from_index: - order.remove(name) + order.remove(str(name)) - return merge(Xt, X)[order] + return self._convert(merge(X_new, Xt)[order]) @beartype @@ -420,8 +420,7 @@ def __init__( self.operators = operators self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> 
Self: """Fit to data. Parameters @@ -429,18 +428,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -448,6 +437,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Estimator instance. """ + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + all_operators = { "add": "add_numeric", "sub": "subtract_numeric", @@ -470,7 +465,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: if self.strategy == "dfs": # Run deep feature synthesis with transformation primitives - es = ft.EntitySet(dataframes={"X": (X, "_index", None, None, None, True)}) + es = ft.EntitySet(dataframes={"X": (Xt, "_index", None, None, None, True)}) self._dfs = ft.dfs( target_dataframe_name="X", entityset=es, @@ -481,7 +476,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: ) # Select the new features (dfs also returns originals) - self._dfs = self._dfs[X.shape[1] - 1:] + self._dfs = self._dfs[Xt.shape[1] - 1:] # Get a random selection of features if self.n_features and self.n_features < len(self._dfs): @@ -500,17 +495,16 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: init_depth=kwargs.pop("init_depth", (1, 2)), const_range=kwargs.pop("const_range", None), function_set=operators, - feature_names=X.columns, + feature_names=Xt.columns, verbose=kwargs.pop("verbose", 0 if self.verbose < 2 else 1), n_jobs=kwargs.pop("n_jobs", self.n_jobs), random_state=kwargs.pop("random_state", self.random_state), **kwargs, - ).fit(X, y) + ).fit(Xt, yt) return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Generate new features. Parameters @@ -518,7 +512,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -527,18 +521,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Generating new features...", 1) if self.strategy == "dfs": - es = ft.EntitySet(dataframes={"X": (X, "index", None, None, None, True)}) - dfs = ft.calculate_feature_matrix( - features=self._dfs, - entityset=es, - n_jobs=self.n_jobs, - ) + es = ft.EntitySet(dataframes={"X": (Xt, "index", None, None, None, True)}) + dfs = ft.calculate_feature_matrix(self._dfs, entityset=es, n_jobs=self.n_jobs) # Add the new features to the feature set - X = pd.concat([X, dfs], axis=1).set_index("index") + Xt = pd.concat([Xt, dfs], axis=1).set_index("index") self._log(f" --> {len(self._dfs)} new features were added.", 2) @@ -548,7 +542,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: data=[ ["", str(fx), fx.fitness_] for i, fx in enumerate(self.gfg_) - if str(fx) not in X.columns + if str(fx) not in Xt.columns ], columns=["name", "description", "fitness"], ) @@ -556,7 +550,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: # Check if any new features remain if len(df) == 0: self._log(" --> The genetic algorithm didn't find any improving features.", 2) - return X + return Xt # Select the n_features with the highest fitness df = df.drop_duplicates() @@ -566,17 +560,16 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: if len(df) != self.n_features: self._log( f" --> Dropping {(self.n_features or len(self.gfg_)) - len(df)} " - "features due to repetition.", - 2, + "features due to repetition.", 2, ) - for i, array in enumerate(self.gfg_.transform(X)[:, df.index].T): + for i, array in enumerate(self.gfg_.transform(Xt)[:, df.index].T): # If the column is new, use a default name counter = 0 while True: - name = f"x{X.shape[1] + counter}" - if name not in X: - X[name] = array # Add new feature to X + name = f"x{Xt.shape[1] + counter}" + if name not in Xt: + Xt[name] = array # Add new feature to X df.iloc[i, 0] = name break else: @@ -585,7 +578,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log(f" --> {len(df)} new features were added.", 2) self.genetic_features_ = df.reset_index(drop=True) - return X + return self._convert(Xt) @beartype @@ -681,8 +674,7 @@ def __init__( self.operators = operators self.drop_columns = drop_columns - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Group features. Parameters @@ -690,7 +682,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -699,6 +691,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
""" + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + self._log("Grouping features...", 1) if self.operators is None: @@ -710,10 +704,10 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: for name, group in self.groups.items(): for operator in operators: try: - result = X[group].apply(getattr(np, operator), axis=1) + result = Xt[group].apply(getattr(np, operator), axis=1) except AttributeError: try: - result = getattr(stats, operator)(X[group], axis=1)[0] + result = getattr(stats, operator)(Xt[group], axis=1)[0] except AttributeError: raise ValueError( "Invalid value for the operators parameter. Value " @@ -721,7 +715,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: ) from None try: - X[f"{operator}({name})"] = result + Xt[f"{operator}({name})"] = result except ValueError: raise ValueError( "Invalid value for the operators parameter. Value " @@ -732,9 +726,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log(f" --> Group {name} successfully created.", 2) if self.drop_columns: - X = X.drop(columns=to_drop) + Xt = Xt.drop(columns=to_drop) - return X + return self._convert(Xt) @beartype @@ -901,35 +895,12 @@ class FeatureSelector(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "sklearn" (default) - - "sklearnex" - - "cuml" - - backend: str, default="loky" - Parallelization backend. Read more in the - [user guide][parallel-execution]. Choose from: - - - "loky": Single-node, process-based parallelism. - - "multiprocessing": Legacy single-node, process-based - parallelism. Less robust than `loky`. - - "threading": Single-node, thread-based parallelism. - - "ray": Multi-node, process-based parallelism. + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -1015,7 +986,6 @@ def __init__( n_jobs: NJobs = 1, device: str = "cpu", engine: Engine = None, - backend: Backend = "loky", verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, **kwargs, @@ -1024,7 +994,6 @@ def __init__( n_jobs=n_jobs, device=device, engine=engine, - backend=backend, verbose=verbose, random_state=random_state, ) @@ -1036,8 +1005,7 @@ def __init__( self.max_correlation = max_correlation self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit the feature selector to the data. The univariate, sfm (when model is not fitted), sfs, rfe and @@ -1049,18 +1017,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. 
- - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. Returns ------- @@ -1070,14 +1028,6 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """ from atom.models import MODELS - def check_y(): - """For some strategies, y needs to be provided.""" - if y is None: - raise ValueError( - "Invalid value for the y parameter. Value cannot " - f"be None for strategy='{self.strategy}'." - ) - def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): """Objective function for the advanced optimization strategies.""" if X_train.equals(X_valid): @@ -1087,6 +1037,18 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): model.fit(X_train, y_train) return scoring(model, X_valid, y_valid) + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) + + if yt is None and self.strategy not in ("pca", "sfm", None): + raise ValueError( + "Invalid value for the y parameter. Value cannot " + f"be None for strategy='{self.strategy}'." + ) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self.collinear_ = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"]) self.scaler_ = None @@ -1094,21 +1056,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._high_variance: dict[Hashable, tuple[Hashable, int]] = {} self._low_variance: dict[Hashable, tuple[Hashable, float]] = {} self._estimator: Any = None - self._n_features = None - - strategies = { - "univariate": "SelectKBest", - "pca": "PCA", - "sfm": "SelectFromModel", - "sfs": "SequentialFeatureSelector", - "rfe": "RFE", - "rfecv": "RFECV", - "pso": ParticleSwarmOptimization, - "hho": HarrisHawkOptimization, - "gwo": GreyWolfOptimization, - "dfo": DragonFlyOptimization, - "go": GeneticOptimization, - } + self._n_features: int | None = None if isinstance(self.strategy, str): if self.strategy not in ("univariate", "pca"): @@ -1144,7 +1092,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): if hasattr(self, x) }, ) - model.task = goal.infer_task(y) + if yt is not None: + model.task = goal.infer_task(yt) solver = model._get_est({}) else: raise ValueError( @@ -1171,25 +1120,25 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) if self.n_features is None: - self._n_features = X.shape[1] + self._n_features = Xt.shape[1] elif self.n_features < 1: - self._n_features = int(self.n_features * X.shape[1]) + self._n_features = int(self.n_features * Xt.shape[1]) else: - self._n_features = self.n_features + self._n_features = int(self.n_features) min_repeated: Scalar if self.min_repeated is None: min_repeated = 1 elif self.min_repeated <= 1: - min_repeated = self.min_repeated * len(X) + min_repeated = self.min_repeated * len(Xt) else: min_repeated = int(self.min_repeated) max_repeated: Scalar if self.max_repeated is None: - max_repeated = len(X) + max_repeated = len(Xt) elif self.max_repeated <= 1: - max_repeated = self.max_repeated * len(X) + max_repeated = self.max_repeated * len(Xt) else: max_repeated = int(self.max_repeated) @@ -1203,30 +1152,30 @@ def objective_function(model, X_train, 
y_train, X_valid, y_valid, scoring): # Remove features with too high variance if self.min_repeated is not None: - for name, column in X.select_dtypes(exclude="number").items(): + for name, column in Xt.select_dtypes(exclude="number").items(): max_counts = column.value_counts() if min_repeated > max_counts.max(): self._high_variance[name] = (max_counts.idxmax(), max_counts.max()) - X = X.drop(columns=name) + Xt = Xt.drop(columns=name) break # Remove features with too low variance if self.max_repeated is not None: - for name, column in X.select_dtypes(exclude="number").items(): + for name, column in Xt.select_dtypes(exclude="number").items(): for category, count in column.value_counts().items(): if count >= max_repeated: - self._low_variance[name] = (category, 100.0 * count / len(X)) - X = X.drop(columns=name) + self._low_variance[name] = (category, 100.0 * count / len(Xt)) + Xt = Xt.drop(columns=name) break # Remove features with too high correlation self.collinear = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"]) if self.max_correlation: # Get the Pearson correlation coefficient matrix - if y is None: - corr_X = X.corr() + if yt is None: + corr_X = Xt.corr() else: - corr_matrix = merge(X, y).corr() + corr_matrix = merge(Xt, yt).corr() corr_X, corr_y = corr_matrix.iloc[:-1, :-1], corr_matrix.iloc[:-1, -1] corr = {} @@ -1237,7 +1186,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # Always finds himself with correlation 1 if len(corr[col]) > 1: - if y is None: + if yt is None: # Drop all but the first one to_drop.extend(list(corr[col][1:].index)) else: @@ -1262,7 +1211,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ignore_index=True, ) - X = X.drop(columns=self.collinear_["drop"].tolist()) + Xt = Xt.drop(columns=self.collinear_["drop"].tolist()) if self.strategy is None: return self # Exit feature_engineering @@ -1292,15 +1241,14 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): else: solver = self.solver - check_y() - self._estimator = SelectKBest(solver, k=self._n_features).fit(X, y) + self._estimator = SelectKBest(solver, k=self._n_features).fit(Xt, yt) elif self.strategy == "pca": - if not is_sparse(X): + if not is_sparse(Xt): # PCA requires the features to be scaled - if not check_scaling(X): - self.scaler_ = Scaler() - X = self.scaler_.fit_transform(X) + if not check_scaling(Xt): + self.scaler_ = Scaler(device=self.device, engine=self.engine) + Xt = cast(pd.DataFrame, self.scaler_.fit_transform(Xt)) estimator = self._get_est_class("PCA", "decomposition") solver_param = "svd_solver" @@ -1316,11 +1264,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # The PCA and TruncatedSVD both get all possible components to use # for the plots (n_components must be < n_features and <= n_rows) self._estimator = estimator( - n_components=min(len(X), X.shape[1] - 1), + n_components=min(len(Xt), Xt.shape[1] - 1), **{solver_param: solver}, random_state=self.random_state, **self.kwargs, - ).fit(X) + ).fit(Xt) self._estimator._comps = min(self._estimator.components_.shape[0], self._n_features) @@ -1342,7 +1290,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) if prefit: - if list(getattr(solver, "feature_names_in_", [])) != list(X.columns): + if list(getattr(solver, "feature_names_in_", [])) != list(Xt.columns): raise ValueError( "Invalid value for the solver parameter. 
The " f"{solver.__class__.__name__} estimator " @@ -1350,13 +1298,10 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) self._estimator.estimator_ = solver else: - check_y() - self._estimator.fit(X, y) + self._estimator.fit(Xt, yt) elif self.strategy in ("sfs", "rfe", "rfecv"): if self.strategy == "sfs": - check_y() - if self.kwargs.get("scoring"): kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"]) @@ -1368,8 +1313,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) elif self.strategy == "rfe": - check_y() - self._estimator = RFE( estimator=solver, n_features_to_select=self._n_features, @@ -1377,13 +1320,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) elif self.strategy == "rfecv": - check_y() - if self.kwargs.get("scoring"): kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"]) # Invert n_features to select them all (default option) - if self._n_features == X.shape[1]: + if self._n_features == Xt.shape[1]: self._n_features = 1 self._estimator = RFECV( @@ -1393,11 +1334,16 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) - with joblib.parallel_backend(backend=self.backend): - self._estimator.fit(X, y) + self._estimator.fit(Xt, yt) else: - check_y() + strategies = { + "pso": ParticleSwarmOptimization, + "hho": HarrisHawkOptimization, + "gwo": GreyWolfOptimization, + "dfo": DragonFlyOptimization, + "go": GeneticOptimization, + } # Either use a provided validation set or cross-validation over X if "X_valid" in kwargs: @@ -1411,7 +1357,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): "cannot be absent when X_valid is provided." ) else: - X_valid, y_valid = X, y + X_valid, y_valid = Xt, yt # Get scoring for default objective_function if "objective_function" not in kwargs: @@ -1419,7 +1365,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): kwargs["scoring"] = get_custom_scorer(kwargs["scoring"]) else: goal = Goal(0) if is_classifier(solver) else Goal(1) - task = goal.infer_task(y) + if yt is not None: + task = goal.infer_task(yt) if task is Task.binary_classification: kwargs["scoring"] = get_custom_scorer("f1") elif task.is_multiclass: @@ -1435,8 +1382,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._estimator.fit( model=solver, - X_train=X, - y_train=y, + X_train=Xt, + y_train=yt, X_valid=X_valid, y_valid=y_valid, verbose=self.verbose >= 2, @@ -1491,8 +1438,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> ] ) - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Transform the data. Parameters @@ -1500,7 +1446,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1509,6 +1455,10 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Performing feature selection ...", 1) # Remove features with too high variance @@ -1516,9 +1466,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log( f" --> Feature {fx} was removed due to high variance. " f"Value {h_variance[0]} was the most repeated value with " - f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.", 2, + f"{h_variance[1]} ({h_variance[1] / len(Xt):.1f}%) occurrences.", 2, ) - X = X.drop(columns=fx) + Xt = Xt.drop(columns=fx) # Remove features with too low variance for fx, l_variance in self._low_variance.items(): @@ -1526,7 +1476,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f" --> Feature {fx} was removed due to low variance. Value " f"{l_variance[0]} repeated in {l_variance[1]:.1f}% of the rows.", 2, ) - X = X.drop(columns=fx) + Xt = Xt.drop(columns=fx) # Remove features with too high correlation for col in self.collinear_["drop"]: @@ -1534,34 +1484,34 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f" --> Feature {col} was removed due to " "collinearity with another feature.", 2, ) - X = X.drop(columns=col) + Xt = Xt.drop(columns=col) # Perform selection based on strategy if self.strategy is None: - return X + return self._convert(Xt) elif self.strategy == "univariate": self._log( f" --> The univariate test selected " f"{self._n_features} features from the dataset.", 2, ) - for n, column in enumerate(X): + for n, column in enumerate(Xt): if not self.univariate_.get_support()[n]: self._log( f" --> Dropping feature {column} " f"(score: {self.univariate_.scores_[n]:.2f} " f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2, ) - X = X.drop(columns=column) + Xt = Xt.drop(columns=column) elif self.strategy == "pca": self._log(" --> Applying Principal Component Analysis...", 2) if self.scaler_: self._log(" --> Scaling features...", 2) - X = self.scaler_.transform(X) + Xt = cast(pd.DataFrame, self.scaler_.transform(Xt)) - X = self._estimator.transform(X).iloc[:, :self._estimator._comps] + Xt = self._estimator.transform(Xt).iloc[:, :self._estimator._comps] var = np.array(self._estimator.explained_variance_ratio_[:self._n_features]) self._log(f" --> Keeping {self._estimator._comps} components.", 2) @@ -1571,7 +1521,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: mask = self._estimator.get_support() self._log(f" --> {self.strategy} selected {sum(mask)} features from the dataset.", 2) - for n, column in enumerate(X): + for n, column in enumerate(Xt): if not mask[n]: if hasattr(self._estimator, "ranking_"): self._log( @@ -1580,7 +1530,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: ) else: self._log(f" --> Dropping feature {column}.", 2) - X = X.drop(columns=column) + Xt = Xt.drop(columns=column) else: # Advanced strategies self._log( @@ -1588,9 +1538,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: "features from the dataset.", 2, ) - for column in X: + for column in Xt: if column not in self._estimator.best_feature_list: self._log(f" --> Dropping feature {column}.", 2) - X = X.drop(columns=column) + Xt = Xt.drop(columns=column) - return X + return self._convert(Xt) diff --git a/atom/models/classreg.py b/atom/models/classreg.py index 9dfac9c04..02dffde14 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -7,6 +7,7 @@ from __future__ import annotations +from collections.abc import 
Mapping from typing import Any, ClassVar, cast import numpy as np @@ -22,7 +23,7 @@ from optuna.trial import Trial from atom.basemodel import BaseModel -from atom.utils.types import DataFrame, Pandas, Predictor +from atom.utils.types import Pandas, Predictor from atom.utils.utils import CatBMetric, Goal, LGBMetric, XGBMetric @@ -76,7 +77,7 @@ class AdaBoost(BaseModel): "regression": "sklearn.ensemble.AdaBoostRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -485,8 +486,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ): """Fit the estimator and perform in-training validation. @@ -734,7 +735,7 @@ class DecisionTree(BaseModel): "regression": "sklearn.tree.DecisionTreeRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -809,7 +810,7 @@ class Dummy(BaseModel): "regression": "sklearn.dummy.DummyRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -943,7 +944,7 @@ class ExtraTree(BaseModel): "regression": "sklearn.tree.ExtraTreeRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1038,7 +1039,7 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1229,7 +1230,7 @@ class GradientBoostingMachine(BaseModel): "regression": "sklearn.ensemble.GradientBoostingRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1376,7 +1377,7 @@ class HistGradientBoosting(BaseModel): "regression": "sklearn.ensemble.HistGradientBoostingRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1452,7 +1453,7 @@ class KNearestNeighbors(BaseModel): "regression": "sklearn.neighbors.KNeighborsRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1675,8 +1676,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ): """Fit the estimator and perform in-training validation. 
@@ -1951,7 +1952,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: else: return super()._get_est(params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2053,7 +2054,7 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2161,7 +2162,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2400,7 +2401,7 @@ class PassiveAggressive(BaseModel): "regression": "sklearn.linear_model.PassiveAggressiveRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2712,7 +2713,7 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2805,7 +2806,7 @@ class Ridge(BaseModel): "regression": "sklearn.linear_model.Ridge", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2878,7 +2879,7 @@ class StochasticGradientDescent(BaseModel): "regression": "sklearn.linear_model.SGDRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -3003,7 +3004,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: else: return super()._get_est(params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -3132,8 +3133,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ): """Fit the estimator and perform in-training validation. 
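A minimal sketch of the loosened _get_distributions annotation applied across the models above (the hyperparameter name and bounds are made up): Mapping is covariant in its value type, so an override returning a plain dict keyed to a concrete distribution class still satisfies the base signature.

from collections.abc import Mapping
from optuna.distributions import BaseDistribution, IntDistribution

def _get_distributions() -> Mapping[str, BaseDistribution]:
    # A plain dict is a Mapping, so this return type-checks without casting.
    return {"n_estimators": IntDistribution(low=50, high=500, step=10)}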
diff --git a/atom/models/custom.py b/atom/models/custom.py index 6c9e49495..ae10fe3be 100644 --- a/atom/models/custom.py +++ b/atom/models/custom.py @@ -5,7 +5,6 @@ """ -from functools import cached_property from typing import Any from atom.basemodel import BaseModel @@ -56,7 +55,7 @@ def fullname(self) -> str: """Return the estimator's class name.""" return self._est_class.__name__ - @cached_property + @property def _est_class(self) -> type[Predictor]: """Return the estimator's class.""" return self._est diff --git a/atom/models/ts.py b/atom/models/ts.py index 59f078f1b..f067eb5d9 100644 --- a/atom/models/ts.py +++ b/atom/models/ts.py @@ -7,6 +7,7 @@ from __future__ import annotations +from collections.abc import Mapping from logging import ERROR, WARNING, getLogger from typing import Any, ClassVar @@ -161,7 +162,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: """ return super()._get_est({"suppress_warnings": self.warnings == "ignore"} | params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -835,7 +836,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: return {"stl_kwargs": self._est_params.get("stl_kwargs", {}) | params} - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1207,7 +1208,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1652,7 +1653,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: """ return super()._get_est({"suppress_warnings": self.warnings == "ignore"} | params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. 
Returns diff --git a/atom/nlp.py b/atom/nlp.py index 3e05e57fb..392124eb9 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -10,35 +10,31 @@ import re import unicodedata from string import punctuation +from typing import TYPE_CHECKING -import nltk import numpy as np import pandas as pd from beartype import beartype -from nltk.collocations import ( - BigramCollocationFinder, QuadgramCollocationFinder, - TrigramCollocationFinder, -) -from nltk.corpus import wordnet -from nltk.stem import SnowballStemmer, WordNetLemmatizer from sklearn.base import OneToOneFeatureMixin -from sklearn.utils._set_output import _SetOutputMixin from sklearn.utils.validation import _check_feature_names_in from typing_extensions import Self from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, DataFrame, Engine, FloatLargerZero, Pandas, Sequence, - VectorizerStarts, Verbose, bool_t, + Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, + XConstructor, XReturn, YConstructor, bool_t, ) from atom.utils.utils import ( - check_is_fitted, check_nltk_module, composed, crash, get_corpus, is_sparse, - merge, method_to_log, to_df, + check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df, ) +if TYPE_CHECKING: + from nltk.corpus import wordnet + + @beartype -class TextCleaner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class TextCleaner(TransformerMixin, OneToOneFeatureMixin): r"""Applies standard text cleaning to the corpus. Transformations include normalizing characters and dropping @@ -193,8 +189,7 @@ def __init__( self.regex_number = regex_number self.drop_punctuation = drop_punctuation - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the transformations to the data. Parameters @@ -204,7 +199,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -245,28 +240,29 @@ def drop_regex(regex: str): Regex pattern to replace. 
""" - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].str.replace(regex, "", regex=True) + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].str.replace(regex, "", regex=True) else: - X[corpus] = X[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x]) + Xt[corpus] = Xt[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x]) - corpus = get_corpus(X) + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + corpus = get_corpus(Xt) self._log("Cleaning the corpus...", 1) if self.decode: - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].apply(lambda x: to_ascii(x)) + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].apply(lambda x: to_ascii(x)) else: - X[corpus] = X[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc]) + Xt[corpus] = Xt[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc]) self._log(" --> Decoding unicode characters to ascii.", 2) if self.lower_case: self._log(" --> Converting text to lower case.", 2) - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].str.lower() + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].str.lower() else: - X[corpus] = X[corpus].apply(lambda doc: [str(w).lower() for w in doc]) + Xt[corpus] = Xt[corpus].apply(lambda doc: [str(w).lower() for w in doc]) if self.drop_email: if not self.regex_email: @@ -306,21 +302,21 @@ def drop_regex(regex: str): if self.drop_punctuation: self._log(" --> Dropping punctuation from the text.", 2) trans_table = str.maketrans("", "", punctuation) # Translation table - if isinstance(X[corpus].iloc[0], str): + if isinstance(Xt[corpus].iloc[0], str): func = lambda doc: doc.translate(trans_table) else: func = lambda doc: [str(w).translate(trans_table) for w in doc] - X[corpus] = X[corpus].apply(func) + Xt[corpus] = Xt[corpus].apply(func) # Drop empty tokens from every document - if not isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].apply(lambda doc: [w for w in doc if w]) + if not isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].apply(lambda doc: [w for w in doc if w]) - return X + return self._convert(Xt) @beartype -class TextNormalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class TextNormalizer(TransformerMixin, OneToOneFeatureMixin): """Normalize the corpus. Convert words to a more uniform standard. The transformations @@ -444,8 +440,7 @@ def __init__( self.stem = stem self.lemmatize = lemmatize - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Normalize the text. Parameters @@ -455,7 +450,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
Returns @@ -488,31 +483,36 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: else: # "NN", "NNS", "NNP", "NNPS" return wordnet.NOUN - corpus = get_corpus(X) + from nltk import pos_tag + from nltk.corpus import stopwords, wordnet + from nltk.stem import SnowballStemmer, WordNetLemmatizer + + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + corpus = get_corpus(Xt) self._log("Normalizing the corpus...", 1) # If the corpus is not tokenized, separate by space - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].apply(lambda row: row.split()) + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].apply(lambda row: row.split()) - stopwords = set() + stop_words = set() if self.stopwords: if isinstance(self.stopwords, bool_t): self.stopwords = "english" # Get stopwords from the NLTK library check_nltk_module("corpora/stopwords", quiet=self.verbose < 2) - stopwords = set(nltk.corpus.stopwords.words(self.stopwords.lower())) + stop_words = set(stopwords.words(self.stopwords.lower())) # Join predefined with customs stopwords if self.custom_stopwords is not None: - stopwords = stopwords | set(self.custom_stopwords) + stop_words = stop_words | set(self.custom_stopwords) - if stopwords: + if stop_words: self._log(" --> Dropping stopwords.", 2) - f = lambda row: [word for word in row if word not in stopwords] - X[corpus] = X[corpus].apply(f) + f = lambda row: [word for word in row if word not in stop_words] + Xt[corpus] = Xt[corpus].apply(f) if self.stem: if isinstance(self.stem, bool_t): @@ -520,7 +520,7 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: self._log(" --> Applying stemming.", 2) ss = SnowballStemmer(language=self.stem.lower()) - X[corpus] = X[corpus].apply(lambda row: [ss.stem(word) for word in row]) + Xt[corpus] = Xt[corpus].apply(lambda row: [ss.stem(word) for word in row]) if self.lemmatize: self._log(" --> Applying lemmatization.", 2) @@ -529,14 +529,14 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: check_nltk_module("corpora/omw-1.4", quiet=self.verbose < 2) wnl = WordNetLemmatizer() - f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in nltk.pos_tag(row)] - X[corpus] = X[corpus].apply(f) + f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in pos_tag(row)] + Xt[corpus] = Xt[corpus].apply(f) - return X + return self._convert(Xt) @beartype -class Tokenizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Tokenizer(TransformerMixin, OneToOneFeatureMixin): """Tokenize the corpus. Convert documents into sequences of words. Additionally, @@ -664,8 +664,7 @@ def __init__( self.trigram_freq = trigram_freq self.quadgram_freq = quadgram_freq - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Tokenize the text. Parameters @@ -675,7 +674,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
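A small sketch of the lazy-import pattern adopted above (the function name is hypothetical): nltk is now pulled in inside the method at call time, so importing atom.nlp no longer requires it upfront.

def lemmatize_tokens(tokens: list[str]) -> list[str]:
    # Runtime import: nltk is loaded only when the function is actually used
    # (requires the wordnet corpus to be downloaded).
    from nltk.stem import WordNetLemmatizer

    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(token) for token in tokens]

print(lemmatize_tokens(["cars", "running"]))  # ['car', 'running'] without a POS tag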
Returns @@ -712,24 +711,28 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: return row_c[2:-2].split(sep) - corpus = get_corpus(X) + import nltk.collocations as collocations + from nltk import word_tokenize + + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + corpus = get_corpus(Xt) self._log("Tokenizing the corpus...", 1) - if isinstance(X[corpus].iloc[0], str): + if isinstance(Xt[corpus].iloc[0], str): check_nltk_module("tokenizers/punkt", quiet=self.verbose < 2) - X[corpus] = X[corpus].apply(lambda row: nltk.word_tokenize(row)) + Xt[corpus] = Xt[corpus].apply(lambda row: word_tokenize(row)) ngrams = { - "bigrams": BigramCollocationFinder, - "trigrams": TrigramCollocationFinder, - "quadgrams": QuadgramCollocationFinder, + "bigrams": collocations.BigramCollocationFinder, + "trigrams": collocations.TrigramCollocationFinder, + "quadgrams": collocations.QuadgramCollocationFinder, } for attr, finder in ngrams.items(): if frequency := getattr(self, f"{attr[:-1]}_freq"): # Search for all n-grams in the corpus - ngram_fd = finder.from_documents(X[corpus]).ngram_fd + ngram_fd = finder.from_documents(Xt[corpus]).ngram_fd if frequency < 1: frequency = int(frequency * len(ngram_fd)) @@ -740,7 +743,7 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: if freq >= frequency: occur += 1 counts += freq - X[corpus] = X[corpus].apply(replace_ngrams, args=(ngram,)) + Xt[corpus] = Xt[corpus].apply(replace_ngrams, args=(ngram,)) rows.append({attr[:-1]: "_".join(ngram), "frequency": freq}) if rows: @@ -752,11 +755,11 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: else: self._log(f" --> No {attr} found in the corpus.", 2) - return X + return self._convert(Xt) @beartype -class Vectorizer(TransformerMixin, _SetOutputMixin): +class Vectorizer(TransformerMixin): """Vectorize text data. Transform the corpus into meaningful vectors of numbers. The @@ -792,24 +795,12 @@ class Vectorizer(TransformerMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -923,8 +914,7 @@ def _get_corpus_columns(self) -> list[str]: "The get_feature_names_out method is not available for strategy='hashing'." ) - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -934,7 +924,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
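A hedged usage sketch for the Tokenizer above (assumes the atom package built from this diff; the punkt tokenizer is fetched on first use): with bigram_freq set, word pairs that occur often enough are merged into single tokens.

import pandas as pd
from atom.nlp import Tokenizer

X = pd.DataFrame({"corpus": ["new york is big", "i love new york", "new york in winter"]})

X_tok = Tokenizer(bigram_freq=3).transform(X)
print(X_tok["corpus"].iloc[0])  # e.g. ['new_york', 'is', 'big'] once the bigram is merged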
Returns @@ -943,11 +933,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Estimator instance. """ - self._corpus = get_corpus(X) + Xt = to_df(X) + self._corpus = get_corpus(Xt) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) # Convert a sequence of tokens to space separated string - if not isinstance(X[self._corpus].iloc[0], str): - X[self._corpus] = X[self._corpus].apply(lambda row: " ".join(row)) + if not isinstance(Xt[self._corpus].iloc[0], str): + Xt[self._corpus] = Xt[self._corpus].apply(lambda row: " ".join(row)) strategies = { "bow": "CountVectorizer", @@ -966,7 +960,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._estimator.set_output(transform="default") self._log("Fitting Vectorizer...", 1) - self._estimator.fit(X[self._corpus]) + self._estimator.fit(Xt[self._corpus]) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) @@ -994,8 +988,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> og_columns = [c for c in self.feature_names_in_ if c != self._corpus] return np.array(og_columns + self._get_corpus_columns()) - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Vectorize the text. Parameters @@ -1005,7 +998,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1014,14 +1007,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed corpus. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Vectorizing the corpus...", 1) # Convert a sequence of tokens to space-separated string - if not isinstance(X[self._corpus].iloc[0], str): - X[self._corpus] = X[self._corpus].apply(lambda row: " ".join(row)) + if not isinstance(Xt[self._corpus].iloc[0], str): + Xt[self._corpus] = Xt[self._corpus].apply(lambda row: " ".join(row)) - matrix = self._estimator.transform(X[self._corpus]) - X = X.drop(columns=self._corpus) # Drop original corpus column + matrix = self._estimator.transform(Xt[self._corpus]) + Xt = Xt.drop(columns=self._corpus) # Drop original corpus column if "sklearn" not in self._estimator.__class__.__module__: matrix = matrix.get() # Convert cupy sparse array back to scipy @@ -1029,7 +1026,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: if not self.return_sparse: self._log(" --> Converting the output to a full array.", 2) matrix = matrix.toarray() - elif not X.empty and not is_sparse(X): + elif not Xt.empty and not is_sparse(Xt): # Raise if there are other columns that are non-sparse raise ValueError( "Invalid value for the return_sparse parameter. 
The value must " @@ -1042,4 +1039,4 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: # Hashing has no words to put as column names columns = [f"hash{i}" for i in range(1, matrix.shape[1] + 1)] - return merge(X, to_df(matrix, index=X.index, columns=columns)) + return self._convert(merge(Xt, to_df(matrix, index=Xt.index, columns=columns))) diff --git a/atom/pipeline.py b/atom/pipeline.py index e09c3578f..d4d57b391 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -9,9 +9,10 @@ from collections.abc import Iterator from itertools import islice -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload import numpy as np +import pandas as pd from joblib import Memory from sklearn.base import clone from sklearn.pipeline import Pipeline as SkPipeline @@ -22,19 +23,26 @@ from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import check_memory from sktime.forecasting.base import BaseForecaster -from sktime.proba.normal import Normal from typing_extensions import Self from atom.utils.types import ( - Bool, DataFrame, Estimator, FHConstructor, Float, Pandas, Scalar, Sequence, - Verbose, XConstructor, YConstructor, + Bool, EngineDataOptions, EngineTuple, Estimator, FHConstructor, Float, + Pandas, Scalar, Sequence, Verbose, XConstructor, XReturn, YConstructor, + YReturn, ) from atom.utils.utils import ( - NotFittedError, adjust_verbosity, check_is_fitted, fit_one, - fit_transform_one, transform_one, variable_return, + NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, to_df, + to_tabular, transform_one, variable_return, ) +if TYPE_CHECKING: + from sktime.proba.normal import Normal + + +T = TypeVar("T") + + class Pipeline(SkPipeline): """Pipeline of transforms with a final estimator. @@ -55,6 +63,7 @@ class Pipeline(SkPipeline): and additionally: - Can initialize with an empty pipeline. + - Always returns 'pandas' objects. - Accepts transformers that drop rows. - Accepts transformers that only are fitted on a subset of the provided dataset. @@ -221,6 +230,35 @@ def _can_inverse_transform(self) -> bool: for _, _, est in self._iter() ) + @overload + def _convert(self, obj: Literal[None]) -> None: ... + + @overload + def _convert(self, obj: pd.DataFrame) -> XReturn: ... + + @overload + def _convert(self, obj: pd.Series) -> YReturn: ... + + def _convert(self, obj: Pandas | None) -> YReturn | None: + """Convert data to the type set in the data engine. + + Parameters + ---------- + obj: pd.Series, pd.DataFrame or None + Object to convert. If None, return as is. + + Returns + ------- + object + Converted data. + + """ + # Only apply transformations when the engine is defined + if hasattr(self, "_engine") and isinstance(obj, pd.Series | pd.DataFrame): + return self._engine.data_engine.convert(obj) + else: + return obj + def _iter( self, *, @@ -273,17 +311,17 @@ def _fit( X: XConstructor | None = None, y: YConstructor | None = None, routed_params: dict[str, Bunch] | None = None, - ) -> tuple[DataFrame | None, Pandas | None]: + ) -> tuple[pd.DataFrame | None, Pandas | None]: """Get data transformed through the pipeline. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None if the pipeline only uses y. + `X` is ignored. None if the pipeline only uses y. - y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. 
+ y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. routed_params: dict or None, default=None Metadata parameters routed for the fit method. @@ -300,6 +338,9 @@ def _fit( self.steps: list[tuple[str, Estimator]] = list(self.steps) self._validate_steps() + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + for step, name, transformer in self._iter( with_final=False, filter_passthrough=False, filter_train_only=False ): @@ -318,14 +359,14 @@ def _fit( if hasattr(transformer, attr): setattr(cloned, attr, getattr(transformer, attr)) - with adjust_verbosity(cloned, self._verbose): + with adjust(cloned, verbose=self._verbose): # Fit or load the current estimator from cache # Type ignore because routed_params is never None but # the signature of _fit needs to comply with sklearn's - X, y, fitted_transformer = self._mem_fit_transform( + Xt, yt, fitted_transformer = self._mem_fit_transform( transformer=cloned, - X=X, - y=y, + X=Xt, + y=yt, message=self._log_message(step), **routed_params[name].fit_transform, # type: ignore[index] ) @@ -334,7 +375,7 @@ def _fit( # estimator (necessary when loading from cache) self.steps[step] = (name, fitted_transformer) - return X, y + return Xt, yt def get_metadata_routing(self): """Get metadata routing of this object. @@ -428,10 +469,10 @@ def fit( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. **params Parameters requested and accepted by steps. Each step must @@ -445,15 +486,15 @@ def fit( """ routed_params = self._check_method_params(method="fit", props=params) - X, y = self._fit(X, y, routed_params) + Xt, yt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator is not None and self._final_estimator != "passthrough": - with adjust_verbosity(self._final_estimator, self._verbose): + with adjust(self._final_estimator, verbose=self._verbose): self._mem_fit( estimator=self._final_estimator, - X=X, - y=y, + X=Xt, + y=yt, **routed_params[self.steps[-1][0]].fit, ) @@ -465,7 +506,7 @@ def fit_transform( X: XConstructor | None = None, y: YConstructor | None = None, **params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Fit the pipeline and transform the data. Call `fit` followed by `transform` on each transformer in the @@ -479,11 +520,11 @@ def fit_transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None + `X` is ignored. None if the estimator only uses y. - y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. **params Parameters requested and accepted by steps. 
Each step must @@ -500,21 +541,21 @@ def fit_transform( """ routed_params = self._check_method_params(method="fit_transform", props=params) - X, y = self._fit(X, y, routed_params) + Xt, yt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator is None or self._final_estimator == "passthrough": - return variable_return(X, y) + return variable_return(Xt, yt) - with adjust_verbosity(self._final_estimator, self._verbose): - X, y, _ = self._mem_fit_transform( + with adjust(self._final_estimator, verbose=self._verbose): + Xt, yt, _ = self._mem_fit_transform( transformer=self._final_estimator, - X=X, - y=y, + X=Xt, + y=yt, **routed_params[self.steps[-1][0]].fit_transform, ) - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @available_if(_can_transform) def transform( @@ -524,7 +565,7 @@ def transform( *, filter_train_only: Bool = True, **params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Transform the data. Call `transform` on each transformer in the pipeline. The @@ -538,10 +579,10 @@ def transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None if the pipeline only uses y. + `X` is ignored. None if the pipeline only uses y. - y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. filter_train_only: bool, default=True Whether to exclude transformers that should only be used @@ -564,19 +605,22 @@ def transform( if X is None and y is None: raise ValueError("X and y cannot be both None.") + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + _raise_for_params(params, self, "transform") routed_params = process_routing(self, "transform", **params) for _, name, transformer in self._iter(filter_train_only=filter_train_only): - with adjust_verbosity(transformer, self._verbose): - X, y = self._mem_transform( + with adjust(transformer, verbose=self._verbose): + Xt, yt = self._mem_transform( transformer=transformer, - X=X, - y=y, + X=Xt, + y=yt, **routed_params[name].transform, ) - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @available_if(_can_inverse_transform) def inverse_transform( @@ -586,7 +630,7 @@ def inverse_transform( *, filter_train_only: Bool = True, **params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inverse transform for each step in a reverse order. All estimators in the pipeline must implement the @@ -596,10 +640,10 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None if the pipeline only uses y. + `X` is ignored. None if the pipeline only uses y. - y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + y: sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. 
filter_train_only: bool, default=True Whether to exclude transformers that should only be used @@ -622,21 +666,24 @@ def inverse_transform( if X is None and y is None: raise ValueError("X and y cannot be both None.") + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + _raise_for_params(params, self, "inverse_transform") routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter(filter_train_only=filter_train_only))) for _, name, transformer in reverse_iter: - with adjust_verbosity(transformer, self._verbose): - X, y = self._mem_transform( + with adjust(transformer, verbose=self._verbose): + Xt, yt = self._mem_transform( transformer=transformer, - X=X, - y=y, + X=Xt, + y=yt, method="inverse_transform", **routed_params[name].inverse_transform, ) - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @available_if(_final_estimator_has("decision_function")) def decision_function(self, X: XConstructor, **params) -> np.ndarray: @@ -661,20 +708,22 @@ def decision_function(self, X: XConstructor, **params) -> np.ndarray: multiclass classification tasks. """ + Xt = to_df(X) + _raise_for_params(params, self, "decision_function") routed_params = process_routing(self, "decision_function", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, _ = self._mem_transform( + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform( transformer=transformer, - X=X, + X=Xt, **routed_params.get(name, {}).get("transform", {}), ) return self.steps[-1][1].decision_function( - X, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}) + Xt, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}) ) @available_if(_final_estimator_has("predict")) @@ -715,19 +764,21 @@ def predict( if X is None and fh is None: raise ValueError("X and fh cannot be both None.") + Xt = to_df(X) + routed_params = process_routing(self, "predict", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): if fh is None: raise ValueError("The fh parameter cannot be None for forecasting estimators.") - return self.steps[-1][1].predict(fh=fh, X=X) + return self.steps[-1][1].predict(fh=fh, X=Xt) else: - return self.steps[-1][1].predict(X, **routed_params[self.steps[-1][0]].predict) + return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) @available_if(_final_estimator_has("predict_interval")) def predict_interval( @@ -736,7 +787,7 @@ def predict_interval( X: XConstructor | None = None, *, coverage: Float | Sequence[Float] = 0.9, - ) -> Pandas: + ) -> pd.DataFrame: """Transform, then predict_quantiles of the final estimator. Parameters @@ -757,11 +808,13 @@ def predict_interval( Computed interval forecasts. 
""" + Xt = to_df(X) + for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, y = self._mem_transform(transformer, X) + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform(transformer, Xt) - return self.steps[-1][1].predict_interval(fh=fh, X=X, coverage=coverage) + return self.steps[-1][1].predict_interval(fh=fh, X=Xt, coverage=coverage) @available_if(_final_estimator_has("predict_log_proba")) def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray: @@ -784,14 +837,16 @@ def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray: n_classes) or a list of arrays for [multioutput tasks][]. """ + Xt = to_df(X) + routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( - X, **routed_params[self.steps[-1][0]].predict_log_proba + Xt, **routed_params[self.steps[-1][0]].predict_log_proba ) @available_if(_final_estimator_has("predict_proba")) @@ -838,20 +893,22 @@ def predict_proba( if X is None and fh is None: raise ValueError("X and fh cannot be both None.") + Xt = to_df(X) + routed_params = process_routing(self, "predict_proba", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): if fh is None: raise ValueError("The fh parameter cannot be None for forecasting estimators.") - return self.steps[-1][1].predict_proba(fh=fh, X=X, marginal=marginal) + return self.steps[-1][1].predict_proba(fh=fh, X=Xt, marginal=marginal) else: return self.steps[-1][1].predict_proba( - X, **routed_params[self.steps[-1][0]].predict_proba + Xt, **routed_params[self.steps[-1][0]].predict_proba ) @available_if(_final_estimator_has("predict_quantiles")) @@ -883,11 +940,13 @@ def predict_quantiles( Computed quantile forecasts. """ + Xt = to_df(X) + for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, y = self._mem_transform(transformer, X) + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform(transformer, Xt) - return self.steps[-1][1].predict_quantiles(fh=fh, X=X, alpha=alpha) + return self.steps[-1][1].predict_quantiles(fh=fh, X=Xt, alpha=alpha) @available_if(_final_estimator_has("predict_residuals")) def predict_residuals( @@ -912,11 +971,14 @@ def predict_residuals( n_targets) for [multivariate][] tasks. 
""" + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, y = self._mem_transform(transformer, X, y) + with adjust(transformer, verbose=self._verbose): + Xt, yt = self._mem_transform(transformer, Xt, yt) - return self.steps[-1][1].predict_residuals(y=y, X=X) + return self.steps[-1][1].predict_residuals(y=yt, X=Xt) @available_if(_final_estimator_has("predict_var")) def predict_var( @@ -925,7 +987,7 @@ def predict_var( X: XConstructor | None = None, *, cov: Bool = False, - ) -> DataFrame: + ) -> pd.DataFrame: """Transform, then predict_var of the final estimator. Parameters @@ -947,11 +1009,49 @@ def predict_var( Computed variance forecasts. """ + Xt = to_df(X) + for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, _ = self._mem_transform(transformer, X) + with adjust(transformer, verbose=self._verbose): + Xt, _ = self._mem_transform(transformer, Xt) + + return self.steps[-1][1].predict_var(fh=fh, X=Xt, cov=cov) - return self.steps[-1][1].predict_var(fh=fh, X=X, cov=cov) + def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: + """Set output container. + + See sklearn's [user guide][set_output] on how to use the + `set_output` API. See [here][data-acceleration] a description + of the choices. + + Parameters + ---------- + transform: str or None, default=None + Configure the output of the `transform`, `fit_transform`, + and `inverse_transform` method. If None, the configuration + is not changed. Choose from: + + - "numpy" + - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" + - "pyarrow" + - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" + + Returns + ------- + Self + Estimator instance. + + """ + if transform is not None: + self._engine = EngineTuple(data=transform) + + return self @available_if(_final_estimator_has("score")) def score( @@ -971,7 +1071,7 @@ def score( Feature set with shape=(n_samples, n_features). Can only be `None` for [forecast][time-series] tasks. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target values corresponding to `X`. 
fh: int, sequence, [ForecastingHorizon][] or None, default=None @@ -992,6 +1092,9 @@ def score( if X is None and y is None: raise ValueError("X and y cannot be both None.") + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + # Drop sample weights if sktime estimator if not isinstance(self._final_estimator, BaseForecaster): params["sample_weight"] = sample_weight @@ -999,10 +1102,10 @@ def score( routed_params = process_routing(self, "score", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): - X, y = self._mem_transform(transformer, X, y, **routed_params[name].transform) + with adjust(transformer, verbose=self._verbose): + Xt, yt = self._mem_transform(transformer, Xt, yt, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): - return self.steps[-1][1].score(y=y, X=X, fh=fh) + return self.steps[-1][1].score(y=yt, X=Xt, fh=fh) else: - return self.steps[-1][1].score(X, y, **routed_params[self.steps[-1][0]].score) + return self.steps[-1][1].score(Xt, yt, **routed_params[self.steps[-1][0]].score) diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index 5ea7aa3ca..d23b16e82 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -15,6 +15,7 @@ import matplotlib.pyplot as plt import numpy as np +import pandas as pd import plotly.express as px import plotly.graph_objects as go from beartype import beartype @@ -25,9 +26,9 @@ from atom.plots.basefigure import BaseFigure from atom.utils.constants import PALETTE from atom.utils.types import ( - Bool, DataFrame, FloatLargerZero, FloatZeroToOneExc, Index, Int, - IntLargerZero, Legend, MetricSelector, Model, ModelsSelector, PlotBackend, - RowSelector, Scalar, Sequence, int_t, sequence_t, + Bool, FloatLargerZero, FloatZeroToOneExc, Int, IntLargerZero, Legend, + MetricSelector, Model, ModelsSelector, Pandas, PlotBackend, RowSelector, + Scalar, Sequence, int_t, sequence_t, ) from atom.utils.utils import ( Aesthetics, check_is_fitted, composed, crash, get_custom_scorer, lst, @@ -139,7 +140,7 @@ def marker_size(self, value: FloatLargerZero): # Methods ====================================================== >> @staticmethod - def _get_plot_index(df: DataFrame) -> Index: + def _get_plot_index(obj: Pandas) -> pd.Index: """Return the dataset's index in a plottable format. Plotly does not accept all index formats (e.g., pd.Period), @@ -148,19 +149,19 @@ def _get_plot_index(df: DataFrame) -> Index: Parameters ---------- - df: dataframe + obj: pd.Series or pd.DataFrame Data set to get the index from. Returns ------- - index + pd.Index Index in an acceptable format. 
""" - if hasattr(df.index, "to_timestamp"): - return df.index.to_timestamp() + if hasattr(obj.index, "to_timestamp"): + return obj.index.to_timestamp() else: - return df.index + return obj.index @staticmethod def _get_show(show: IntLargerZero | None, maximum: IntLargerZero = 200) -> Int: diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index 215bb488d..f8e2ceae5 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -30,8 +30,8 @@ from atom.plots.baseplot import BasePlot from atom.utils.constants import PALETTE from atom.utils.types import ( - Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, PACFMethods, - RowSelector, Segment, Sequence, Series, TargetSelector, + Bool, ColumnSelector, Int, IntLargerZero, Legend, PACFMethods, RowSelector, + Segment, Sequence, TargetSelector, ) from atom.utils.utils import ( check_dependency, crash, divide, get_corpus, has_task, lst, @@ -540,7 +540,7 @@ def plot_components( @crash def plot_correlation( self, - columns: Segment | Sequence[Int | str] | DataFrame | None = None, + columns: Segment | Sequence[Int | str] | pd.DataFrame | None = None, method: Literal["pearson", "kendall", "spearman"] = "pearson", *, title: str | dict[str, Any] | None = None, @@ -1223,7 +1223,7 @@ def plot_ngrams( """ - def get_text(column: Series) -> Series: + def get_text(column: pd.Series) -> pd.Series: """Get the complete corpus as sequence of tokens. Parameters @@ -1862,7 +1862,7 @@ def plot_qq( @crash def plot_relationships( self, - columns: Segment | Sequence[Int | str] | DataFrame = (0, 1, 2), + columns: Segment | Sequence[Int | str] | pd.DataFrame = (0, 1, 2), *, title: str | dict[str, Any] | None = None, legend: Legend | dict[str, Any] | None = None, diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py index d23499cfe..038e7f86d 100644 --- a/atom/plots/hyperparametertuningplot.py +++ b/atom/plots/hyperparametertuningplot.py @@ -14,6 +14,7 @@ from typing import Any import numpy as np +import pandas as pd import plotly.graph_objects as go from optuna.importance import FanovaImportanceEvaluator from optuna.trial import TrialState @@ -32,7 +33,7 @@ int_t, segment_t, ) from atom.utils.utils import ( - bk, check_dependency, crash, divide, get_segment, it, lst, rnd, + check_dependency, crash, divide, get_segment, it, lst, rnd, ) @@ -244,8 +245,8 @@ def plot_edf( models_c = self._check_hyperparams(models_c) metric_c = self._get_metric(metric) - x_min = bk.concat([m.trials[metric_c] for m in models_c]).min(axis=None) - x_max = bk.concat([m.trials[metric_c] for m in models_c]).max(axis=None) + x_min = pd.concat([m.trials[metric_c] for m in models_c]).min(axis=None) + x_max = pd.concat([m.trials[metric_c] for m in models_c]).max(axis=None) x = np.linspace(x_min, x_max, 100) self._get_figure() diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index e3dcf6a68..cd0386a22 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -39,10 +39,10 @@ Bool, ColumnSelector, FloatZeroToOneExc, Int, IntLargerEqualZero, IntLargerFour, IntLargerZero, Kind, Legend, MetricConstructor, MetricSelector, ModelsSelector, RowSelector, Sequence, TargetSelector, - TargetsSelector, XSelector, index_t, + TargetsSelector, XConstructor, ) from atom.utils.utils import ( - Task, bk, check_canvas, check_dependency, check_empty, check_predict_proba, + Task, check_canvas, check_dependency, check_empty, check_predict_proba, crash, divide, get_custom_scorer, has_task, lst, rnd, ) @@ -832,7 +832,7 @@ 
def plot_errors( from atom.models import OrdinaryLeastSquares model = OrdinaryLeastSquares(goal=self._goal) - estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred) + estimator = model._get_est({}).fit(pd.DataFrame(y_true), y_pred) self._draw_line( x=(x := np.linspace(y_true.min(), y_true.max(), 100)), @@ -1116,7 +1116,7 @@ def plot_forecast( self, models: ModelsSelector = None, fh: RowSelector | ForecastingHorizon = "dataset", - X: XSelector | None = None, + X: XConstructor | None = None, target: TargetSelector = 0, *, plot_insample: Bool = False, @@ -1232,18 +1232,20 @@ def plot_forecast( for m in models_c: if X is not None: - X = m.transform(X) - elif isinstance(fh, index_t): - X = m.branch._all.loc[fh] + Xt = m.transform(X) + elif isinstance(fh, pd.Index): + Xt = m.branch._all.loc[fh] + else: + Xt = X # Draw predictions and interval - y_pred = m.predict(fh=fh, X=check_empty(X)) + y_pred = m.predict(fh=fh, X=check_empty(Xt)) if self.task.is_multioutput: y_pred = y_pred[target_c] if not plot_insample: idx = y_pred.index.intersection(m.branch.train.index) - y_pred.loc[idx] = np.NaN # type: ignore[index] + y_pred.loc[idx] = np.NaN # type: ignore[call-overload] y_true = m.branch._all.loc[y_pred.index, target_c] @@ -1271,7 +1273,7 @@ def plot_forecast( if plot_interval: try: - y_interval = m.predict_interval(fh=fh, X=X) + y_interval = m.predict_interval(fh=fh, X=Xt) except (AttributeError, NotImplementedError): continue # Fails for some models like ES @@ -1887,7 +1889,7 @@ class is always the positive one. data = data.sample(500, random_state=self.random_state) explanation = m._shap.get_explanation(data, target_c) - shap = bk.DataFrame(explanation.values, columns=m.branch.features) + shap = pd.DataFrame(explanation.values, columns=m.branch.features) parshap[ds] = pd.Series(index=fxs, dtype=float) for fx in fxs: @@ -2134,7 +2136,7 @@ def plot_partial_dependence( axes.append((xaxis, yaxis)) # Compute averaged predictions - predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + predictions = Parallel(n_jobs=self.n_jobs)( delayed(partial_dependence)( estimator=m.estimator, X=m.branch.X_test, diff --git a/atom/training.py b/atom/training.py index 6b0fbe8fe..8cd5a97e0 100644 --- a/atom/training.py +++ b/atom/training.py @@ -371,6 +371,7 @@ class DirectClassifier(Direct): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -605,6 +606,7 @@ class DirectForecaster(Direct): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -835,6 +837,7 @@ class DirectRegressor(Direct): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1075,6 +1078,7 @@ class SuccessiveHalvingClassifier(SuccessiveHalving): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. 
+ - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1312,6 +1316,7 @@ class SuccessiveHalvingForecaster(SuccessiveHalving): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1546,6 +1551,7 @@ class SuccessiveHalvingRegressor(SuccessiveHalving): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1792,6 +1798,7 @@ class TrainSizingClassifier(TrainSizing): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -2035,6 +2042,7 @@ class TrainSizingForecaster(TrainSizing): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -2274,6 +2282,7 @@ class TrainSizingRegressor(TrainSizing): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the diff --git a/atom/utils/patches.py b/atom/utils/patches.py index ad592220c..bc7b2bc2b 100644 --- a/atom/utils/patches.py +++ b/atom/utils/patches.py @@ -9,7 +9,6 @@ from collections.abc import Callable from copy import deepcopy -from functools import wraps from typing import Any from unittest.mock import patch @@ -22,7 +21,6 @@ from sklearn.ensemble._base import _fit_single_estimator from sklearn.model_selection._validation import _fit_and_score, _score from sklearn.utils import Bunch -from sklearn.utils._set_output import _wrap_method_output from sklearn.utils.multiclass import check_classification_targets from sktime.forecasting.compose import EnsembleForecaster as EF from sktime.forecasting.compose import StackingForecaster as SF @@ -36,24 +34,6 @@ # Functions ======================================================== >> -def wrap_method_output(f: Callable, method: str) -> Callable: - """Wrap sklearn's _wrap_method_output function. - - Custom implementation to avoid errors for transformers that allow - only providing `y`. Is used internally by _SetOutputMixin. - - """ - - @wraps(f) - def wrapper(self, *args, **kwargs): - try: - return _wrap_method_output(f, method)(self, *args, **kwargs) - except TypeError: - return f(self, *args, **kwargs) - - return wrapper - - def fit_and_score(*args, **kwargs) -> dict[str, Any]: """Wrap sklearn's _fit_and_score function. 
diff --git a/atom/utils/types.py b/atom/utils/types.py index f9a674aa3..8a86a3067 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -7,16 +7,16 @@ from __future__ import annotations -from collections.abc import Callable, Hashable, Iterable, Iterator +import os +from collections.abc import Callable, Hashable, Iterator +from importlib.util import find_spec from typing import ( TYPE_CHECKING, Annotated, Any, Literal, NamedTuple, SupportsIndex, TypeAlias, TypedDict, TypeVar, overload, runtime_checkable, ) -import modin.pandas as md import numpy as np import pandas as pd -import scipy.sparse as sps from beartype.door import is_bearable from beartype.typing import Protocol from beartype.vale import Is @@ -25,7 +25,12 @@ if TYPE_CHECKING: - from atom.utils.utils import ClassMap, Goal + from atom.data.dataengines import DataEngine + from atom.utils.utils import Goal + + +# Avoid warning about pyarrow timezones not set +os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" # Classes for type hinting ========================================= >> @@ -117,6 +122,13 @@ def __repr__(self) -> str: """Print representation as dictionary.""" return self._asdict().__repr__() + @property + def data_engine(self) -> DataEngine: + """Return the data engine.""" + from atom.data import DATA_ENGINES + + return DATA_ENGINES[self.data]() + class SPTuple(NamedTuple): """Return type of the `sp` parameter.""" @@ -126,6 +138,28 @@ class SPTuple(NamedTuple): trend_model: SeasonalityModels = "additive" +@runtime_checkable +class SparseMatrix(Protocol): + """Protocol for sparse matrices. + + Required since scipy doesn't have stubs. + + """ + + def __len__(self) -> int: ... + def __iter__(self) -> Iterator: ... + def _bsr_container(self): ... + def _coo_container(self): ... + def _csc_container(self): ... + def _csr_container(self): ... + def _dia_container(self): ... + def _dok_container(self): ... + def _lil_container(self): ... + + @property + def shape(self) -> tuple[int, int]: ... + + @runtime_checkable class SkScorer(Protocol): """Protocol for sklearn's scorers.""" @@ -177,7 +211,6 @@ class Model(Protocol): """Protocol for all models.""" _goal: Goal - _metric: ClassMap _ht: dict[str, Any] def predict(self, *args, **kwargs) -> Pandas: ... @@ -190,11 +223,8 @@ def predict(self, *args, **kwargs) -> Pandas: ... Int: TypeAlias = int | np.integer Float: TypeAlias = float | np.floating Scalar: TypeAlias = Int | Float -Segment: TypeAlias = range | slice -Index: TypeAlias = pd.Index | md.Index -Series: TypeAlias = pd.Series | md.Series -DataFrame: TypeAlias = pd.DataFrame | md.DataFrame -Pandas: TypeAlias = Series | DataFrame +Segment: TypeAlias = slice | range +Pandas: TypeAlias = pd.Series | pd.DataFrame # Numerical types IntLargerZero: TypeAlias = Annotated[Int, Is[lambda x: x > 0]] @@ -210,23 +240,19 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
# Types for X, y and fh XConstructor: TypeAlias = ( dict[str, Sequence[Any]] - | Sequence[Sequence[Any]] - | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]] + | Sequence[Sequence[Any] | tuple[Hashable, Sequence[Any]]] | np.ndarray - | sps.spmatrix - | DataFrame + | SparseMatrix + | pd.Series + | pd.DataFrame ) XSelector: TypeAlias = XConstructor | Callable[..., XConstructor] -YConstructor: TypeAlias = dict[str, Any] | Sequence[Any] | XConstructor +YConstructor: TypeAlias = Sequence[Any] | XConstructor YSelector: TypeAlias = Int | str | YConstructor FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon -# Return types for transform methods -TReturn: TypeAlias = np.ndarray | sps.spmatrix | Series | DataFrame -TReturns: TypeAlias = TReturn | tuple[TReturn, TReturn] - # Selection of rows or columns by name or position -ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | DataFrame +ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | pd.DataFrame RowSelector: TypeAlias = Hashable | Sequence[Hashable] | ColumnSelector # Assignment of index or stratify parameter @@ -248,10 +274,21 @@ def predict(self, *args, **kwargs) -> Pandas: ... # BaseTransformer parameters NJobs: TypeAlias = Annotated[Int, Is[lambda x: x != 0]] -EngineDataOptions: TypeAlias = Literal["pandas", "pyarrow", "modin"] +EngineDataOptions: TypeAlias = Literal[ + "numpy", + "pandas", + "pandas-pyarrow", + "polars", + "polars-lazy", + "pyarrow", + "modin", + "dask", + "pyspark", + "pyspark-pandas", +] EngineEstimatorOptions: TypeAlias = Literal["sklearn", "sklearnex", "cuml"] Engine: TypeAlias = EngineDataOptions | EngineEstimatorOptions | EngineDict | EngineTuple | None -Backend: TypeAlias = Literal["loky", "multiprocessing", "threading", "ray"] +Backend: TypeAlias = Literal["loky", "multiprocessing", "threading", "ray", "dask"] Warnings: TypeAlias = Literal["default", "error", "ignore", "always", "module", "once"] Severity: TypeAlias = Literal["debug", "info", "warning", "error", "critical"] Verbose: TypeAlias = Literal[0, 1, 2] @@ -299,7 +336,11 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Allowed values for method selection PredictionMethods: TypeAlias = Literal[ - "decision_function", "predict", "predict_log_proba", "predict_proba", "score" + "decision_function", + "predict", + "predict_log_proba", + "predict_proba", + "score", ] PredictionMethodsTS: TypeAlias = Literal[ "predict", @@ -331,6 +372,17 @@ def predict(self, *args, **kwargs) -> Pandas: ... ] # Others +XDatasets: TypeAlias = Literal[ + "dataset", + "train", + "test", + "holdout", + "X", + "X_train", + "X_test", + "X_holdout", +] +YDatasets: TypeAlias = Literal["y", "y_train", "y_test", "y_holdout"] Seasonality: TypeAlias = IntLargerOne | str | Sequence[IntLargerOne | str] | None SeasonalityModels: TypeAlias = Literal["additive", "multiplicative"] FeatureNamesOut: TypeAlias = ( @@ -360,6 +412,71 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
| Sequence[IntLargerEqualZero] ) +# Return types for transform methods +if TYPE_CHECKING: + import dask.dataframe as dd + import modin.pandas as md + import polars as pl + import pyarrow as pa + import pyspark.pandas as ps + from pyspark.sql import DataFrame as SparkDataFrame + + XReturn: TypeAlias = ( + np.ndarray + | pd.DataFrame + | pl.DataFrame + | pl.LazyFrame + | pa.Table + | md.DataFrame + | dd.DataFrame + | SparkDataFrame + ) + YReturn: TypeAlias = ( + np.ndarray + | pd.Series + | pl.Series + | pa.Array + | md.Series + | dd.Series + | ps.Series + ) +else: + XReturn: TypeAlias = Sequence[Sequence[Any]] | np.ndarray | SparseMatrix | pd.DataFrame + YReturn: TypeAlias = Sequence[Any] | np.ndarray | pd.Series + + if find_spec("polars"): + import polars as pl + + XReturn = XReturn | pl.DataFrame | pl.LazyFrame + YReturn = YReturn | pl.Series + + if find_spec("pyarrow"): + import pyarrow as pa + + XReturn = XReturn | pa.Table + YReturn = YReturn | pa.Array + + if find_spec("modin"): + import modin.pandas as md + + XReturn = XReturn | md.DataFrame + YReturn = YReturn | md.Series + + if find_spec("dask"): + import dask.dataframe as dd + + XReturn = XReturn | dd.DataFrame + YReturn = YReturn | dd.Series + + if find_spec("pyspark"): + import pyspark.pandas as ps + from pyspark.sql import DataFrame as SparkDataFrame + + XReturn = XReturn | SparkDataFrame | ps.DataFrame + YReturn = YReturn | SparkDataFrame | ps.Series + + YReturn = YReturn | XReturn + # Variable types for isinstance ================================== >> @@ -370,8 +487,5 @@ def predict(self, *args, **kwargs) -> Pandas: ... int_t = (int, np.integer) float_t = (float, np.floating) segment_t = (slice, range) -index_t = (pd.Index, md.Index) -series_t = (pd.Series, md.Series) -sequence_t = (range, list, tuple, np.ndarray, *index_t, *series_t) -dataframe_t = (pd.DataFrame, md.DataFrame) -pandas_t = (*series_t, *dataframe_t) +sequence_t = (range, list, tuple, np.ndarray, pd.Index, pd.Series) +pandas_t = (pd.Series, pd.DataFrame) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 364193aa1..10354963c 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -8,11 +8,10 @@ from __future__ import annotations import functools -import os import sys import warnings from collections import deque -from collections.abc import Callable, Hashable, Iterator +from collections.abc import Callable, Iterator from contextlib import contextmanager from copy import copy from dataclasses import dataclass @@ -23,26 +22,21 @@ from inspect import Parameter, signature from itertools import cycle from types import GeneratorType, MappingProxyType -from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload +from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload import mlflow -import modin.pandas as md import nltk import numpy as np import pandas as pd import plotly.graph_objects as go import scipy.sparse as sps -from beartype import beartype from beartype.door import is_bearable from IPython.display import display from matplotlib.colors import to_rgba -from mlflow.models.signature import infer_signature -from optuna.study import Study -from optuna.trial import FrozenTrial from pandas._libs.missing import NAType -from pandas._typing import Axes, Dtype, DtypeArg +from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype -from shap import Explainer, Explanation +from shap import Explainer from sklearn.base import BaseEstimator from sklearn.base import OneToOneFeatureMixin as FMixin from 
sklearn.metrics import ( @@ -52,31 +46,34 @@ from sklearn.utils import _print_elapsed_time from sklearn.utils.validation import _is_fitted -from atom.utils.constants import __version__ +from atom.utils.constants import CAT_TYPES, __version__ from atom.utils.types import ( - Bool, DataFrame, Estimator, FeatureNamesOut, Float, Index, IndexSelector, - Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Sequence, Series, SPTuple, Transformer, TReturn, TReturns, - Verbose, XConstructor, XSelector, YConstructor, YSelector, dataframe_t, - int_t, pandas_t, segment_t, sequence_t, series_t, + Bool, EngineDataOptions, EngineTuple, Estimator, FeatureNamesOut, Float, + IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, + Predictor, Scalar, Scorer, Segment, Sequence, SPTuple, Transformer, + Verbose, XConstructor, XReturn, YConstructor, YReturn, int_t, segment_t, + sequence_t, ) if TYPE_CHECKING: + from optuna.study import Study + from optuna.trial import FrozenTrial + from shap import Explanation + from atom.basemodel import BaseModel from atom.baserunner import BaseRunner - from atom.branch import Branch + from atom.data import Branch T = TypeVar("T") -T_Pandas = TypeVar("T_Pandas", Series, DataFrame) +T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame) T_Transformer = TypeVar("T_Transformer", bound=Transformer) T_Estimator = TypeVar("T_Estimator", bound=Estimator) # Classes ========================================================== >> - class NotFittedError(ValueError, AttributeError): """Exception called when the instance is not yet fitted. @@ -98,7 +95,7 @@ def infer_task(self, y: Pandas) -> Task: Parameters ---------- - y: series or dataframe + y: pd.Series or pd.DataFrame Target column(s). Returns @@ -108,17 +105,17 @@ def infer_task(self, y: Pandas) -> Task: """ if self.value == 1: - if isinstance(y, series_t): + if isinstance(y, pd.Series): return Task.regression else: return Task.multioutput_regression elif self.value == 2: - if isinstance(y, series_t): + if isinstance(y, pd.Series): return Task.univariate_forecast else: return Task.multivariate_forecast - if isinstance(y, dataframe_t): + if isinstance(y, pd.DataFrame): if all(y[col].nunique() == 2 for col in y.columns): return Task.multilabel_classification else: @@ -207,10 +204,10 @@ class SeasonalPeriod(IntEnum): class DataContainer: """Stores a branch's data.""" - data: DataFrame # Complete dataset - train_idx: Index # Indices in the train set - test_idx: Index # Indices in the test - n_cols: Int # Number of target columns + data: pd.DataFrame # Complete dataset + train_idx: pd.Index # Indices in the train set + test_idx: pd.Index # Indices in the test + n_targets: int # Number of target columns @dataclass @@ -245,7 +242,7 @@ class DataConfig: """ - index: bool = True + index: bool = False ignore: tuple[str, ...] = () sp: SPTuple = SPTuple() # noqa: RUF009 shuffle: Bool = False @@ -254,20 +251,20 @@ class DataConfig: test_size: Scalar = 0.2 holdout_size: Scalar | None = None - def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: + def get_stratify_columns(self, df: pd.DataFrame, y: Pandas) -> pd.DataFrame | None: """Get columns to stratify by. Parameters ---------- - df: dataframe + df: pd.DataFrame Dataset from which to get the columns. - y: series or dataframe - Target column. + y: pd.Series or pd.DataFrame + Target column(s). Returns ------- - dataframe or None + pd.DataFrame or None Dataset with subselection of columns. 
Returns None if there's no stratification. @@ -302,26 +299,6 @@ def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: return df[inc] -class PandasModin: - """Utility class to select the right data engine. - - Returns pandas or modin depending on the env variable - ATOM_DATA_ENGINE, which is set in BaseTransformer.py. - - """ - - def __getattr__(self, item: str) -> Any: - """Return the backend engine.""" - if os.environ.get("ATOM_DATA_ENGINE") == "modin": - return getattr(md, item) - else: - return getattr(pd, item) - - -# ATOM uses this instance to access the data engine -bk = PandasModin() - - class CatBMetric: """Custom evaluation metric for the CatBoost model. @@ -653,7 +630,8 @@ def __init__(self, model: BaseModel, n_jobs: Int): def __call__(self, study: Study, trial: FrozenTrial): """Print trial info and store in mlflow experiment.""" try: # Fails when there are no successful trials - trial_info = self.T.trials.reset_index(names="trial").loc[trial.number] + trials = self.T.trials.reset_index(names="trial") + trial_info = cast(pd.Series, trials.loc[trial.number]) # Loc returns df or series except KeyError: return @@ -685,7 +663,7 @@ def __call__(self, study: Study, trial: FrozenTrial): mlflow.sklearn.log_model( sk_model=estimator, artifact_path=estimator.__class__.__name__, - signature=infer_signature( + signature=mlflow.models.signature.infer_signature( model_input=pd.DataFrame(self.T.branch.X), model_output=estimator.predict(self.T.branch.X.iloc[[0]]), ), @@ -761,7 +739,7 @@ def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics): self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} - traces: list[go.Scatter] = [] + traces = [] colors = cycle(aesthetics.palette) for met in metric: color = next(colors) @@ -943,16 +921,16 @@ def explainer(self) -> Explainer: Returns ------- - Explainer + shap.Explainer Get the initialized explainer object. """ - # Pass masker as np.array and feature names separately for modin frames kwargs = { - "masker": self.branch.X_train.to_numpy(), + "masker": self.branch.X_train, "feature_names": list(self.branch.features), "seed": self.random_state, } + try: # Fails when model does not fit standard explainers (e.g., ensembles) return Explainer(self.estimator, **kwargs) except TypeError: @@ -961,7 +939,7 @@ def explainer(self) -> Explainer: def get_explanation( self, - df: DataFrame, + df: pd.DataFrame, target: tuple[Int, ...], ) -> Explanation: """Get an Explanation object. @@ -970,7 +948,7 @@ def get_explanation( Parameters ---------- - df: dataframe + df: pd.DataFrame Data set to look at (subset of the complete dataset). target: tuple @@ -1009,10 +987,10 @@ def get_explanation( ) from None # Remember shap values in the _shap_values attribute - self._shap_values = bk.concat( + self._shap_values = pd.concat( [ self._shap_values, - bk.Series(list(self._explanation.values), index=calculate.index), + pd.Series(list(self._explanation.values), index=calculate.index), ] ) @@ -1330,7 +1308,7 @@ def sign(obj: Callable) -> MappingProxyType: return signature(obj).parameters -def merge(*args) -> DataFrame: +def merge(*args) -> pd.DataFrame: """Concatenate pandas objects column-wise. None and empty objects are ignored. @@ -1342,14 +1320,14 @@ def merge(*args) -> DataFrame: Returns ------- - dataframe + pd.DataFrame Concatenated dataframe. 
""" if len(args_c := [x for x in args if x is not None and not x.empty]) == 1: - return bk.DataFrame(args_c[0]) + return pd.DataFrame(args_c[0]) else: - return bk.DataFrame(bk.concat(args_c, axis=1)) + return pd.DataFrame(pd.concat(args_c, axis=1)) def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_Pandas: @@ -1360,7 +1338,7 @@ def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_P Parameters ---------- - X: series or dataframe + X: pd.Series or pd.DataFrame Data set to replace. missing_values: list or None, default=None @@ -1369,7 +1347,7 @@ def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_P Returns ------- - series or dataframe + pd.Series or pd.DataFrame Data set without missing values. """ @@ -1393,42 +1371,94 @@ def get_nan(dtype: Dtype) -> float | NAType: # Always convert these values default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf] - if isinstance(X, series_t): + if isinstance(X, pd.DataFrame): return X.replace( - to_replace=(missing_values or []) + default_values, - value=get_nan(X.dtype), + to_replace={c: (missing_values or []) + default_values for c in X.columns}, + value={c: get_nan(d) for c, d in X.dtypes.items()}, ) else: return X.replace( - to_replace={k: (missing_values or []) + default_values for k in X}, - value={k: get_nan(X[k].dtype) for k in X}, + to_replace=(missing_values or []) + default_values, + value=get_nan(X.dtype), ) -def get_cols(elem: Pandas) -> list[Series]: +def n_cols(obj: YConstructor | None) -> int: + """Get the number of columns in a dataset. + + Parameters + ---------- + obj: dict, sequence, dataframe-like or None + Dataset to check. + + Returns + ------- + int + Number of columns. + + """ + if hasattr(obj, "shape"): + return obj.shape[1] if len(obj.shape) > 1 else 1 # type: ignore[union-attr] + elif isinstance(obj, dict): + return 2 # Dict always goes to dataframe + + try: + if (array := np.asarray(obj)).ndim > 1: + return array.shape[1] + else: + return array.ndim + except ValueError: + # Fails for inhomogeneous data, return series + return 1 + + +def get_cols(obj: Pandas) -> list[pd.Series]: """Get a list of columns in dataframe / series. Parameters ---------- - elem: series or dataframe + obj: pd.Series or pd.DataFrame Element to get the columns from. Returns ------- - list of series - Columns in elem. + list of pd.Series + Columns. """ - if isinstance(elem, series_t): - return [elem] + if isinstance(obj, pd.Series): + return [obj] else: - return [elem[col] for col in elem.columns] + return [obj[col] for col in obj.columns] + + +def get_col_names(obj: Any) -> list[str] | None: + """Get a list of column names in tabular objects. + + Parameters + ---------- + obj: object + Element to get the column names from. + + Returns + ------- + list of str + Names of the columns. Returns None when the object passed is + no pandas object. + + """ + if isinstance(obj, pd.DataFrame): + return list(obj.columns) + elif isinstance(obj, pd.Series): + return [str(obj.name)] + else: + return None def variable_return( - X: DataFrame | None, - y: Series | None, -) -> DataFrame | Series | tuple[DataFrame, Pandas]: + X: XReturn | None, + y: YReturn | None, +) -> XReturn | tuple[XReturn, YReturn]: """Return one or two arguments depending on which is None. This utility is used to make methods return only the provided @@ -1440,20 +1470,22 @@ def variable_return( Feature set. y: series, dataframe or None - Target column. + Target column(s). 
Returns ------- - dataframe, series or tuple + series, dataframe or tuple Data sets that are not None. """ - if y is None: + if y is None and X is not None: return X - elif X is None: + elif X is None and y is not None: return y - else: + elif X is not None and y is not None: return X, y + else: + raise ValueError("Both X and y can't be None.") def get_segment(obj: list[T], segment: Segment) -> list[T]: @@ -1486,7 +1518,7 @@ def is_sparse(obj: Pandas) -> bool: Parameters ---------- - obj: series or dataframe + obj: pd.Series or pd.DataFrame Data set to check. Returns @@ -1498,25 +1530,27 @@ def is_sparse(obj: Pandas) -> bool: return any(isinstance(col.dtype, pd.SparseDtype) for col in get_cols(obj)) -def check_empty(obj: Pandas) -> Pandas | None: +def check_empty(obj: Pandas | None) -> Pandas | None: """Check if a pandas object is empty. Parameters ---------- - obj: series or dataframe + obj: pd.Series, pd.DataFrame or None Pandas object to check. Returns ------- - series, dataframe or None - Same object or None if empty. + pd.Series, pd.DataFrame or None + Same object or None if empty or obj is None. """ - return obj if isinstance(obj, dataframe_t) and not obj.empty else None + return obj if isinstance(obj, pd.DataFrame) and not obj.empty else None def check_dependency(name: str): - """Raise an error if a package is not installed. + """Check an optional dependency. + + Raise an error if the package is not installed. Parameters ---------- @@ -1524,7 +1558,7 @@ def check_dependency(name: str): Name of the package to check. """ - if not find_spec(name.replace("-", "_")): + if not find_spec(name): raise ModuleNotFoundError( f"Unable to import the {name} package. Install it using " f"`pip install {name}` or install all of atom's optional " @@ -1591,50 +1625,38 @@ def check_predict_proba(models: Model | Sequence[Model], method: str): ) -def check_scaling(X: Pandas, pipeline: Any | None = None) -> bool: +def check_scaling(obj: Pandas) -> bool: """Check if the data is scaled. A data set is considered scaled when the mean of the mean of all columns lies between -0.05 and 0.05 and the mean of the standard deviation of all columns lies between 0.85 and 1.15. - Binary columns are excluded from the calculation. - - Additionally, if a pipeline is provided and there's a scaler in - the pipeline, it also returns False. + Categorical and binary columns are excluded from the calculation. Parameters ---------- - X: series or dataframe + obj: pd.Series or pd.DataFrame Data set to check. - pipeline: Pipeline or None, default=None - Pipeline in which to check for a scaler (any estimator whose - name contains the word scaler). - Returns ------- bool Whether the data set is scaled. """ - has_scaler = False - if pipeline is not None: - has_scaler = any("scaler" in name.lower() for name in pipeline.named_steps) - - df = to_df(X) # Convert to dataframe - df = df.loc[:, (~df.isin([0, 1])).any(axis=0)] # Remove binary columns - - if df.empty: # All columns are binary -> no scaling needed - return True + if isinstance(obj, pd.DataFrame): + mean = obj.mean(numeric_only=True).mean() + std = obj.std(numeric_only=True).mean() else: - mean = df.mean(numeric_only=True).mean() - std = df.std(numeric_only=True).mean() - return has_scaler or bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15) + mean = obj.mean() + std = obj.std() + + return bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15) @contextmanager def keep_attrs(estimator: Estimator): - """Contextmanager to save an estimator's custom attributes. 
+ """Temporarily save an estimator's custom attributes. ATOM's pipeline uses two custom attributes for its transformers: _train_only, and _cols. Since some transformers reset their @@ -1654,30 +1676,42 @@ def keep_attrs(estimator: Estimator): @contextmanager -def adjust_verbosity(estimator: Estimator, verbose: Verbose | None): - """Contextmanager to save an estimator's custom attributes. +def adjust( + estimator: Estimator, + *, + transform: EngineDataOptions | None = None, + verbose: Verbose | None = None, +): + """Temporarily adjust output parameters of an estimator. - ATOM's pipeline uses two custom attributes for its transformers: - _train_only, and _cols. Since some transformers reset their - attributes during fit (like those from sktime), we wrap the fit - method in a contextmanager that saves and restores the attrs. + The estimator's data engine and verbosity are temporarily changed + to the provided values. Parameters ---------- estimator: Estimator Temporarily change the verbosity of this estimator. + transform: str or None, default=None + Data engine for the estimator. If None, it leaves it to + its original engine. + verbose: int or None, default=None - Verbosity level of the transformers in the pipeline. If - None, it leaves them to their original verbosity. + Verbosity level for the estimator. If None, it leaves it to + its original verbosity. """ try: + if transform is not None and hasattr(estimator, "set_output"): + output = getattr(estimator, "_engine", EngineTuple()) + estimator.set_output(transform=transform) if verbose is not None and hasattr(estimator, "verbose"): verbosity = estimator.verbose estimator.verbose = verbose yield estimator finally: + if transform is not None and hasattr(estimator, "set_output"): + estimator._engine = output if verbose is not None and hasattr(estimator, "verbose"): estimator.verbose = verbosity @@ -1704,7 +1738,7 @@ def get_versions(models: ClassMap) -> dict[str, str]: return versions -def get_corpus(df: DataFrame) -> str: +def get_corpus(df: pd.DataFrame) -> str: """Get text column from a dataframe. The text column should be called `corpus` (case-insensitive). Also @@ -1712,7 +1746,7 @@ def get_corpus(df: DataFrame) -> str: Parameters ---------- - df: dataframe + df: pd.DataFrame Data set from which to get the corpus. Returns @@ -1760,164 +1794,125 @@ def time_to_str(t: Scalar) -> str: return f"{h:02.0f}h:{m:02.0f}m:{s:02.0f}s" -def n_cols(data: XSelector | YSelector) -> int: - """Get the number of columns in a dataset. - - Parameters - ---------- - data: sequence or dataframe-like - Dataset to check. - - Returns - ------- - int or None - Number of columns. - - """ - if (array := np.array(data, dtype="object")).ndim > 1: - return array.shape[1] - else: - return array.ndim # Can be zero when input is a dict - - -def to_pyarrow(column: Series, *, inverse: bool = False) -> Dtype: - """Get the pyarrow dtype corresponding to a series. - - Parameters - ---------- - column: series - Column to get the dtype from. If it already has a pyarrow - dtype, return the original dtype. - - inverse: bool, default=False - Whether to convert to pyarrow or back from pyarrow. - - Returns - ------- - str - Name of the converted dtype. 
- - """ - if not inverse and not column.dtype.name.endswith("[pyarrow]"): - if column.dtype.name == "object": - return "string[pyarrow]" # pyarrow doesn't support 'object' - else: - return f"{column.dtype.name}[pyarrow]" - elif inverse and column.dtype.name.endswith("[pyarrow]"): - return column.dtype.name[:-9] - - return column.dtype.name - - @overload def to_df( data: Literal[None], index: Axes | None = ..., columns: Axes | None = ..., - dtype: DtypeArg | None = ..., ) -> None: ... @overload def to_df( - data: XSelector, + data: XConstructor, index: Axes | None = ..., columns: Axes | None = ..., - dtype: DtypeArg | None = ..., -) -> DataFrame: ... +) -> pd.DataFrame: ... def to_df( - data: XSelector | None, + data: XConstructor | None, index: Axes | None = None, columns: Axes | None = None, - dtype: DtypeArg | None = None, -) -> DataFrame | None: - """Convert a dataset to a dataframe. +) -> pd.DataFrame | None: + """Convert a dataset to a pandas dataframe. Parameters ---------- data: dataframe-like or None - Dataset to convert to a dataframe. If None or already a - dataframe, return unchanged. + Dataset to convert to a dataframe. If None or already a + pandas dataframe, return unchanged. - index: sequence, index or None, default=None + index: sequence or None, default=None Values for the index. columns: sequence or None, default=None - Name of the columns. Use None for automatic naming. - - dtype: str, dict, np.dtype or None, default=None - Data types for the output columns. If None, the types are - inferred from the data. + Names of the columns. Use None for automatic naming. Returns ------- - dataframe or None - Dataset as dataframe of a type given by the backend. + pd.DataFrame or None + Data as dataframe. Returns None if data is None. """ if data is not None: - if not isinstance(data, bk.DataFrame): - # Assign default column names (dict already has column names) - if not isinstance(data, dict | Pandas) and columns is None: + if isinstance(data, pd.DataFrame): + data_c = data.copy() + elif hasattr(data, "to_pandas"): + data_c = data.to_pandas() + elif hasattr(data, "__dataframe__"): + # Transform from dataframe interchange protocol + data_c = pd.api.interchange.from_dataframe(data.__dataframe__()) + else: + # Assign default column names (dict and series already have names) + if columns is None and not isinstance(data, dict | pd.Series): columns = [f"x{i}" for i in range(n_cols(data))] - if hasattr(data, "to_pandas") and bk.__name__ == "pandas": - # Convert cuML to pandas - data_c = data.to_pandas() # type: ignore[operator] - elif sps.issparse(data): - data_c = pd.DataFrame.sparse.from_spmatrix( - data=data, + if sps.issparse(data): + data_c = pd.DataFrame.sparse.from_spmatrix(data, index, columns) + else: + data_c = pd.DataFrame( + data=data, # type: ignore[misc, arg-type] index=index, columns=columns, + copy=True, ) - else: - data_c = pd.DataFrame(data, index, columns) # type: ignore[arg-type, misc] + + # If text dataset, change the name of the column to corpus + if list(data_c.columns) == ["x0"] and data_c.dtypes[0].name in CAT_TYPES: + data_c = data_c.rename(columns={data_c.columns[0]: "corpus"}) else: - data_c = data + # Convert all column names to str + data_c.columns = data_c.columns.astype(str) - if dtype is not None: - data_c = data_c.astype(dtype) + # No duplicate rows nor column names are allowed + if data_c.columns.duplicated().any(): + raise ValueError("Duplicate column names found in X.") - if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow": - data_c = data_c.astype({n: 
to_pyarrow(col) for n, col in data_c.items()}) + if columns is not None: + # Reorder columns to the provided order + try: + data_c = data_c[list(columns)] # Force order determined by columns + except KeyError: + raise ValueError( + f"The columns are different than seen at fit time. Features " + f"{set(data_c.columns) - set(columns)} " # type: ignore[arg-type] + "are missing in X." + ) from None return data_c - - return data + else: + return None @overload def to_series( data: Literal[None], index: Axes | None = ..., - name: Hashable | None = ..., - dtype: Dtype | None = ..., + name: str | None = ..., ) -> None: ... @overload def to_series( - data: dict[str, Any] | Sequence[Any], + data: dict[str, Any] | Sequence[Any] | pd.DataFrame, index: Axes | None = ..., - name: Hashable | None = ..., - dtype: Dtype | None = ..., -) -> Series: ... + name: str | None = ..., +) -> pd.Series: ... def to_series( - data: dict[str, Any] | Sequence[Any] | None, + data: dict[str, Any] | Sequence[Any] | pd.DataFrame | None, index: Axes | None = None, - name: Hashable | None = None, - dtype: Dtype | None = None, -) -> Series | None: - """Convert a sequence to a series. + name: str | None = None, +) -> pd.Series | None: + """Convert a sequence to a pandas series. Parameters ---------- - data: dict, sequence or None - Data to convert. If None, return unchanged. + data: dict, sequence, pd.DataFrame or None + Data to convert. If None or already a pandas series, return + unchanged. index: sequence, index or None, default=None Values for the index. @@ -1925,99 +1920,84 @@ def to_series( name: str or None, default=None Name of the series. - dtype: str, np.dtype or None, default=None - Data type for the output series. If None, the type is - inferred from the data. - Returns ------- - series or None - Sequence as series of a type given by the backend. + pd.Series or None + Data as series. Returns None if data is None. """ if data is not None: - if not isinstance(data, bk.Series): - if hasattr(data, "to_pandas") and bk.__name__ == "pandas": - data_c = data.to_pandas() # Convert cuML to pandas - else: - # Flatten for arrays with shape (n_samples, 1), sometimes returned by cuML - data_c = pd.Series( # type: ignore[misc] - data=np.array(data, dtype="object").ravel().tolist(), - index=index, - name=getattr(data, "name", name), - dtype=dtype, # type: ignore[arg-type] - ) + if isinstance(data, pd.Series): + data_c = data.copy() + elif isinstance(data, pd.DataFrame): + data_c = data.iloc[:, 0].copy() + elif hasattr(data, "to_pandas"): + data_c = data.to_pandas() else: - data_c = data + try: + # Flatten for arrays with shape=(n_samples, 1) + array = np.asarray(data).ravel().tolist() + except ValueError: + # Fails for inhomogeneous data + array = data - if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow": - data_c = data_c.astype(to_pyarrow(data_c)) + data_c = pd.Series(array, index=index, name=name or "target", copy=True) return data_c - - return data + else: + return None @overload -def to_pandas( +def to_tabular( data: Literal[None], index: Axes | None = ..., - columns: Axes | None = ..., - name: str | None = ..., - dtype: DtypeArg | None = ..., + columns: str | Axes | None = ..., ) -> None: ... @overload -def to_pandas( +def to_tabular( data: YConstructor, index: Axes | None = ..., - columns: Axes | None = ..., - name: str | None = ..., - dtype: DtypeArg | None = ..., + columns: str | Axes | None = ..., ) -> Pandas: ... 
-def to_pandas( +def to_tabular( data: YConstructor | None, index: Axes | None = None, - columns: Axes | None = None, - name: str | None = None, - dtype: DtypeArg | None = None, + columns: str | Axes | None = None, ) -> Pandas | None: - """Convert a sequence or dataset to a dataframe or series object. + """Convert to a tabular pandas type. If the data is one-dimensional, convert to series, else to a dataframe. Parameters ---------- - data: dict, sequence, dataframe or None + data: dict, sequence, pd.DataFrame or None Data to convert. If None, return unchanged. index: sequence, index or None, default=None Values for the index. - columns: sequence or None, default=None + columns: str, sequence or None, default=None Name of the columns. Use None for automatic naming. - name: str or None, default=None - Name of the series. - - dtype: str, dict, np.dtype or None, default=None - Data type for the output series. If None, the type is - inferred from the data. - Returns ------- - series, dataframe or None - Data as a Pandas object. + pd.Series, pd.DataFrame or None + Data as a pandas object. """ - if n_cols(data) == 1: - return to_series(data, index=index, name=name, dtype=dtype) # type: ignore[misc, arg-type] + if (n_targets := n_cols(data)) == 1: + return to_series(data, index=index, name=flt(columns)) # type: ignore[misc, arg-type] else: - return to_df(data, index=index, columns=columns, dtype=dtype) + if columns is None and not hasattr(data, "__dataframe__"): + columns = [f"y{i}" for i in range(n_targets)] + + return to_df(data, index=index, columns=columns) # type: ignore[misc, arg-type] def check_is_fitted( @@ -2053,26 +2033,6 @@ def check_is_fitted( Whether the estimator is fitted. """ - - def check_attr(attr: str) -> bool: - """Return whether an attribute is False or empty. - - Parameters - ---------- - attr: str - Name of the attribute to check. - - Returns - ------- - bool - Whether the attribute's value is False or empty. - - """ - if isinstance(value := getattr(obj, attr), pandas_t): - return value.empty - else: - return not value - if hasattr(obj, "_is_fitted"): is_fitted = obj._is_fitted else: @@ -2182,10 +2142,10 @@ def get_custom_scorer(metric: str | MetricFunction | Scorer) -> Scorer: # Pipeline functions =============================================== >> def name_cols( - array: TReturn, - original_df: DataFrame, + df: pd.DataFrame, + original_df: pd.DataFrame, col_names: list[str], -) -> list[str]: +) -> pd.Index: """Get the column names after a transformation. If the number of columns is unchanged, the original @@ -2194,10 +2154,10 @@ def name_cols( Parameters ---------- - array: np.ndarray, sps.matrix, series or dataframe + df: pd.DataFrame Transformed dataset. - original_df: dataframe + original_df: pd.DataFrame Original dataset. col_names: list of str @@ -2205,24 +2165,24 @@ def name_cols( Returns ------- - list of str + pd.Index Column names. 
""" # If columns were only transformed, return og names - if array.shape[1] == len(col_names): - return col_names + if df.shape[1] == len(col_names): + return pd.Index(col_names) # If columns were added or removed temp_cols = [] - for i, col in enumerate(array.T): + for i, column in enumerate(get_cols(df)): # equal_nan=True fails for non-numeric dtypes - mask = original_df.apply( + mask = original_df.apply( # type: ignore[type-var] lambda c: np.array_equal( a1=c, - a2=col, - equal_nan=is_numeric_dtype(c) and np.issubdtype(col.dtype, np.number), - ), + a2=column, + equal_nan=is_numeric_dtype(c) and np.issubdtype(column.dtype.name, np.number), + ) ) if any(mask) and mask[mask].index[0] not in temp_cols: @@ -2239,7 +2199,7 @@ def name_cols( else: counter += 1 - return temp_cols + return pd.Index(temp_cols) def get_col_order( @@ -2287,10 +2247,10 @@ def get_col_order( def reorder_cols( transformer: Transformer, - df: DataFrame, - original_df: DataFrame, + df: pd.DataFrame, + original_df: pd.DataFrame, col_names: list[str], -) -> DataFrame: +) -> pd.DataFrame: """Reorder the columns to their original order. This function is necessary in case only a subset of the @@ -2302,10 +2262,10 @@ def reorder_cols( transformer: Transformer Instance that transformed `df`. - df: dataframe + df: pd.DataFrame Dataset to reorder. - original_df: dataframe + original_df: pd.DataFrame Original dataset (states the order). col_names: list of str @@ -2313,7 +2273,7 @@ def reorder_cols( Returns ------- - dataframe + pd.DataFrame Dataset with reordered columns. """ @@ -2353,8 +2313,8 @@ def reorder_cols( def fit_one( estimator: Estimator, - X: XConstructor | None = None, - y: YConstructor | None = None, + X: pd.DataFrame | None = None, + y: Pandas | None = None, message: str | None = None, **fit_params, ) -> Estimator: @@ -2365,19 +2325,12 @@ def fit_one( estimator: Estimator Instance to fit. - X: dataframe-like or None, default=None + X: pd.DataFrame or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + y: pd.Series, pd.DataFrame or None, default=None + Target column(s) corresponding to `X`. message: str or None Short message. If None, nothing will be printed. @@ -2391,30 +2344,27 @@ def fit_one( Fitted estimator. 
""" - Xt = to_df(X, index=getattr(y, "index", None)) - yt = to_pandas(y, index=getattr(Xt, "index", None)) - with _print_elapsed_time("Pipeline", message): if hasattr(estimator, "fit"): - kwargs = {} - inc = getattr(estimator, "_cols", getattr(Xt, "columns", [])) + kwargs: dict[str, Pandas] = {} + inc = getattr(estimator, "_cols", getattr(X, "columns", [])) if "X" in (params := sign(estimator.fit)): - if Xt is not None and (cols := [c for c in inc if c in Xt]): - kwargs["X"] = Xt[cols] + if X is not None and (cols := [c for c in inc if c in X]): + kwargs["X"] = X[cols] # X is required but has not been provided if len(kwargs) == 0: - if yt is not None and hasattr(estimator, "_cols"): - kwargs["X"] = to_df(yt)[inc] + if y is not None and hasattr(estimator, "_cols"): + kwargs["X"] = to_df(y)[inc] elif params["X"].default != Parameter.empty: kwargs["X"] = params["X"].default # Fill X with default - elif Xt is None: + elif X is None: raise ValueError( "Exception while trying to fit transformer " f"{estimator.__class__.__name__}. Parameter " "X is required but has not been provided." ) - elif Xt.empty: + elif X.empty: raise ValueError( "Exception while trying to fit transformer " f"{estimator.__class__.__name__}. Parameter X is " @@ -2423,8 +2373,8 @@ def fit_one( "target column, e.g., atom.decompose(columns=-1)." ) - if "y" in params and yt is not None: - kwargs["y"] = yt + if "y" in params and y is not None: + kwargs["y"] = y # Keep custom attrs since some transformers reset during fit with keep_attrs(estimator): @@ -2435,11 +2385,11 @@ def fit_one( def transform_one( transformer: Transformer, - X: XConstructor | None = None, - y: YConstructor | None = None, + X: pd.DataFrame | None = None, + y: Pandas | None = None, method: Literal["transform", "inverse_transform"] = "transform", **transform_params, -) -> tuple[DataFrame | None, Pandas | None]: +) -> tuple[pd.DataFrame | None, Pandas | None]: """Transform the data using one estimator. Parameters @@ -2447,19 +2397,12 @@ def transform_one( transformer: Transformer Instance to fit. - X: dataframe-like or None, default=None + X: pd.DataFrame or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. - - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + `X` is ignored. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + y: pd.Series, pd.DataFrame or None, default=None + Target column(s) corresponding to `X`. method: str, default="transform" Method to apply: transform or inverse_transform. @@ -2469,127 +2412,103 @@ def transform_one( Returns ------- - dataframe or None + pd.DataFrame or None Feature set. Returns None if not provided. - series, dataframe or None - Target column. Returns None if not provided. + pd.Series, pd.DataFrame or None + Target column(s). Returns None if not provided. """ - def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: - """Convert to df and set correct column names and order. - - If ATOM's data backend="pyarrow", convert the dtypes. + def prepare_df(out: XConstructor, og: pd.DataFrame) -> pd.DataFrame: + """Convert to df and set the correct column names. Parameters ---------- - out: np.ndarray, sps.matrix, series or dataframe + out: dataframe-like Data returned by the transformation. 
- og: dataframe + og: pd.DataFrame Original dataframe, prior to transformations. Returns ------- - dataframe + pd.DataFrame Transformed dataset. """ - use_cols = [c for c in inc if c in og.columns] + out_c = to_df(out, index=og.index) - # Convert to pandas and assign proper column names - if not isinstance(out, dataframe_t): + # Assign proper column names + use_cols = [c for c in inc if c in og.columns] + if not isinstance(out, pd.DataFrame): if hasattr(transformer, "get_feature_names_out"): - columns = transformer.get_feature_names_out() + out_c.columns = transformer.get_feature_names_out() else: - columns = name_cols(out, og, use_cols) - else: - columns = out.columns - - out = to_df(out, index=og.index, columns=columns) + out_c.columns = name_cols(out_c, og, use_cols) # Reorder columns if only a subset was used if len(use_cols) != og.shape[1]: - return reorder_cols(transformer, out, og, use_cols) + return reorder_cols(transformer, out_c, og, use_cols) else: - return out - - Xt = to_df( - data=X, - index=getattr(y, "index", None), - columns=getattr(transformer, "feature_names_in_", None), - ) - yt = to_pandas( - y, - index=getattr(Xt, "index", None), - columns=getattr(transformer, "target_names_in_", None), - name=flt(getattr(transformer, "target_names_in_", None)), - ) + return out_c use_y = True kwargs: dict[str, Any] = {} - inc = list(getattr(transformer, "_cols", getattr(Xt, "columns", []))) + inc = list(getattr(transformer, "_cols", getattr(X, "columns", []))) if "X" in (params := sign(getattr(transformer, method))): - if Xt is not None and (cols := [c for c in inc if c in Xt]): - kwargs["X"] = Xt[cols] + if X is not None and (cols := [c for c in inc if c in X]): + kwargs["X"] = X[cols] # X is required but has not been provided if len(kwargs) == 0: - if yt is not None and hasattr(transformer, "_cols"): - kwargs["X"] = to_df(yt)[inc] + if y is not None and hasattr(transformer, "_cols"): + kwargs["X"] = to_df(y)[inc] use_y = False elif params["X"].default != Parameter.empty: kwargs["X"] = params["X"].default # Fill X with default else: - return Xt, yt # If X is needed, skip the transformer + return X, y # If X is needed, skip the transformer if "y" in params: # We skip `y` when already added to `X` - if yt is not None and use_y: - kwargs["y"] = yt + if y is not None and use_y: + kwargs["y"] = y elif "X" not in params: - return Xt, yt # If y is None and no X in transformer, skip the transformer + return X, y # If y is None and no X in transformer, skip the transformer - out: TReturns = getattr(transformer, method)(**kwargs, **transform_params) + caller = getattr(transformer, method) + out: YConstructor | tuple[XConstructor, YConstructor] = caller(**kwargs, **transform_params) # Transform can return X, y or both - if isinstance(out, tuple): - X_new = prepare_df(out[0], Xt) - y_new = to_pandas( - data=out[1], - index=Xt.index, - name=getattr(yt, "name", None), - columns=getattr(yt, "columns", None), - ) - if isinstance(yt, dataframe_t): - y_new = prepare_df(y_new, yt) - elif "X" in params and X is not None and any(c in Xt for c in inc): + X_new: pd.DataFrame | None + y_new: Pandas | None + if isinstance(out, tuple) and X is not None: + X_new = prepare_df(out[0], X) + y_new = to_tabular(out[1], index=X_new.index) + if isinstance(y, pd.DataFrame) and isinstance(y_new, pd.DataFrame): + y_new = prepare_df(y_new, y) + elif "X" in params and X is not None and any(c in X for c in inc): # X in -> X out - X_new = prepare_df(out, Xt) - y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0) 
+ X_new = prepare_df(out, X) # type: ignore[arg-type] + y_new = y if y is None else y.set_axis(X_new.index, axis=0) elif y is not None: - y_new = to_pandas( - data=out, - index=yt.index, - name=getattr(yt, "name", None), - columns=getattr(yt, "columns", None), - ) - X_new = Xt if Xt is None else Xt.set_index(y_new.index) - if isinstance(yt, dataframe_t): - y_new = prepare_df(y_new, yt) + y_new = to_tabular(out) + X_new = X if X is None else X.set_index(y_new.index) + if isinstance(y, pd.DataFrame) and isinstance(y_new, pd.DataFrame): + y_new = prepare_df(y_new, y) return X_new, y_new def fit_transform_one( transformer: Transformer, - X: XConstructor | None, - y: YConstructor | None, + X: pd.DataFrame | None, + y: Pandas | None, message: str | None = None, **fit_params, -) -> tuple[DataFrame | None, Series | None, Transformer]: +) -> tuple[pd.DataFrame | None, Pandas | None, Transformer]: """Fit and transform the data using one estimator. Estimators without a `transform` method aren't transformed. @@ -2599,19 +2518,12 @@ def fit_transform_one( transformer: Transformer Instance to fit. - X: dataframe-like or None + X: pd.DataFrame or None Feature set with shape=(n_samples, n_features). If None, - X is ignored. - - y: int, str, dict, sequence, dataframe or None - Target column corresponding to `X`. + `X` is ignored. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + y: pd.Series, pd.DataFrame or None + Target column(s) corresponding to `X`. message: str or None, default=None Short message. If None, nothing will be printed. @@ -2621,20 +2533,20 @@ def fit_transform_one( Returns ------- - dataframe or None + pd.DataFrame or None Feature set. Returns None if not provided. - series or None - Target column. Returns None if not provided. + pd.Series, pd.DataFrame or None + Target column(s). Returns None if not provided. Transformer Fitted transformer. """ fit_one(transformer, X, y, message, **fit_params) - X, y = transform_one(transformer, X, y) + Xt, yt = transform_one(transformer, X, y) - return X, y, transformer + return Xt, yt, transformer # Decorators ======================================================= >> @@ -2770,50 +2682,6 @@ def wrapper(*args, **kwargs) -> Any: return wrapper -def wrap_transformer_methods(f: Callable) -> Callable: - """Wrap transformer methods with shared code. - - The following operations are always performed: - - - Transform the input to pandas types. - - Add the `feature_names_in_` and `n_features_in_` attributes. - - Check if the instance is fitted before transforming. 
- - """ - - @wraps(f) - @beartype - def wrapper( - self: T_Transformer, - X: XSelector | None = None, - y: YSelector | None = None, - **kwargs, - ) -> T_Transformer | Pandas | tuple[DataFrame, Pandas]: - if f.__name__ == "fit": - Xt, yt = self._check_input(X, y) - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - return f(self, Xt, yt, **kwargs) - - else: - if "TransformerMixin" not in str(self.fit): - check_is_fitted(self) - - Xt, yt = self._check_input( - X=X, - y=y, - columns=getattr(self, "feature_names_in_", None), - name=getattr(self, "target_names_in_", None), - ) - - if "y" in sign(f): - return f(self, Xt, yt, **kwargs) - else: - return f(self, Xt, **kwargs) - - return wrapper - - def make_sklearn( obj: T_Estimator, feature_names_out: FeatureNamesOut = "one-to-one", @@ -2881,10 +2749,10 @@ def wrapper(self, *args, **kwargs): return wrapper - if not obj.__module__.startswith(("atom.", "sklearn.", "imblearn.")) and hasattr(obj, "fit"): - if isinstance(obj, type): + if not obj.__module__.startswith(("atom.", "sklearn.", "imblearn.")): + if isinstance(obj, type) and hasattr(obj, "fit"): obj.fit = wrap_fit(obj.fit) - else: + elif hasattr(obj.__class__, "fit"): obj.fit = wrap_fit(obj.__class__.fit).__get__(obj) # type: ignore[method-assign] return obj diff --git a/docs_sources/changelog/v6.x.x.md b/docs_sources/changelog/v6.x.x.md index c967f1aaa..c681b188d 100644 --- a/docs_sources/changelog/v6.x.x.md +++ b/docs_sources/changelog/v6.x.x.md @@ -10,6 +10,7 @@ * Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/) and [Python 3.9](ttps://www.python.org/downloads/release/python-390/). * New data engines. Read more in the [user guide][data-acceleration]. +* Added the `dask` [parallelization backend][parallel-execution]. * Improved memory optimizations. Read more in the [user guide][memory-considerations]. * Added the `iterative` strategy for [numerical imputation][imputer]. * Added the `hdbscan` strategy to the [Pruner][] class. diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index fe360d19a..7dcaa5c61 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -26,29 +26,24 @@ packages are necessary for its correct functioning. 
* **[beartype](https://beartype.readthedocs.io/en/latest/)** (>=0.16.4) * **[category-encoders](https://contrib.scikit-learn.org/categorical-encoding/index.html)** (>=2.6.3) -* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8) * **[dill](https://pypi.org/project/dill/)** (>=0.3.6) +* **[featuretools](https://www.featuretools.com/)** (>=1.28.0) * **[gplearn](https://gplearn.readthedocs.io/en/stable/index.html)** (>=0.4.2) * **[imbalanced-learn](https://imbalanced-learn.readthedocs.io/en/stable/api.html)** (>=0.12.0) * **[ipython](https://ipython.readthedocs.io/en/stable/)** (>=8.11.0) * **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1) -* **[featuretools](https://www.featuretools.com/)** (>=1.28.0) * **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1) * **[matplotlib](https://matplotlib.org/)** (>=3.7.2) -* **[mlflow](https://mlflow.org/)** (>=2.7.1) -* **[modin[ray]](https://modin.readthedocs.io/en/stable/)** (>=0.25.0) +* **[mlflow](https://mlflow.org/)** (>=2.10.2) * **[nltk](https://www.nltk.org/)** (>=3.8.1) * **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) -* **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2) -* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1) +* **[pandas](https://pandas.pydata.org/)** (>=2.1.2) * **[plotly](https://plotly.com/python/)** (>=5.18.0) -* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0) * **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.4.0) -* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1) * **[scipy](https://www.scipy.org/)** (>=1.10.1) * **[shap](https://github.com/slundberg/shap/)** (>=0.43.0) -* **[sktime[forecasting]](http://www.sktime.net/en/latest/)** (>=0.24.0) +* **[sktime[forecasting]](http://www.sktime.net/en/latest/)** (>=0.26.0) * **[statsmodels](https://www.statsmodels.org/stable/index.html)** (>=0.14.1) * **[zoofs](https://jaswinder9051998.github.io/zoofs/)** (>=0.1.26) @@ -61,9 +56,19 @@ additional libraries. 
You can install all the optional dependencies using * **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5) * **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2) +* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8) +* **[dask[distributed]](https://dask.org/)** (>=2024.2.0) * **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3) * **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4) * **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0) +* **[modin[ray]](https://modin.readthedocs.io/en/stable/)** (>=0.25.0) +* **[polars](https://pola.rs/)** (>=0.20.7) +* **[pyarrow](https://arrow.apache.org/docs/python/)** (>=15.0.0) +* **[pyspark](https://github.com/apache/spark/tree/master/python)** (>=3.5.0) +* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0) +* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1) +* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0) +* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1) * **[schemdraw](https://schemdraw.readthedocs.io/en/latest/index.html)** (>=0.16) * **[statsforecast](https://github.com/Nixtla/statsforecast/)** (>=1.6.0) * **[sweetviz](https://github.com/fbdesignpro/sweetviz)** (>=2.3.1) @@ -86,7 +91,6 @@ running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/l * **[pandas_stubs](https://pypi.org/project/pandas-stubs/)** (>=2.1.1.230928) * **[pre-commit](https://pre-commit.com/)** (>=3.5.0) * **[ruff](https://docs.astral.sh/ruff/)** (>=0.1.7) -* **[types-requests](https://github.com/python/typeshed)** (>=2.31.0.10) **Testing** diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py index efddad058..5ade3d5cc 100644 --- a/docs_sources/scripts/autodocs.py +++ b/docs_sources/scripts/autodocs.py @@ -85,6 +85,7 @@ votingregressor="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html", ensembleforecaster="https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.forecasting.compose.EnsembleForecaster.html", # Data cleaning + set_output="https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html", clustercentroids="https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.ClusterCentroids.html", onehotencoder="https://contrib.scikit-learn.org/category_encoders/onehot.html", hashingencoder="https://contrib.scikit-learn.org/category_encoders/hashing.html", @@ -926,7 +927,7 @@ def types_conversion(dtype: str) -> str: "Pipeline": "[Pipeline][]", "collections.abc.Hashable": "str", "Scalar": "int | float", - "Pandas": "Series | DataFrame", + "Pandas": "Series | pd.DataFrame", "int | numpy.integer": "int", "float | numpy.floating": "float", "Series | modin.pandas.series.Series": "Series", diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md index 371252f2d..1860f1dc9 100644 --- a/docs_sources/user_guide/accelerating.md +++ b/docs_sources/user_guide/accelerating.md @@ -55,32 +55,48 @@ regardless of the engine parameter. ## Data acceleration +ATOM is mostly built around [sklearn](https://scikit-learn.org/stable/) (and [sktime](https://www.sktime.net/en/stable/) for [time series][] +tasks), and both these libraries use numpy as their computation backend. Since +`atom` relies heavily on column names, it uses pandas (which in turn uses numpy) +as its data backend. 
However, for the convenience of the user, it implements +several data engines that wrap the data in a different type when called by the +user. This is very similar to sklearn's [set_output][] behaviour, but ATOM +extends this to many more data types. For example, selecting the `polars` data +engine makes `atom.dataset` return a polars dataframe and `atom.winner.predict(X)` +return a polars series. + The data engine can be specified through the [`engine`][atomclassifier-engine] -parameter, e.g. `#!python engine="pyarrow"` or -`#!python engine={"data": "pyarrow", "estimator": "sklearnex"}` to combine it -with an [estimator engine][estimator acceleration]. ATOM integrates the following -data engines: - -- **pandas**: This is the default data engine. It uses the [`pandas`](https://pandas.pydata.org/docs/index.html) - library with [`numpy`](https://numpy.org/) as backend. -- **pyarrow**: This engine also uses [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html), but with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html) - backend, instead of `numpy`. PyArrow is a cross-language, platform-independent, - in-memory data format, that provides an efficient and fast way to serialize and - deserialize data. +parameter, e.g. `#!python engine="pyarrow"` or `#!python engine={"data": "pyarrow", +"estimator": "sklearnex"}` to combine it with an [estimator engine][estimator acceleration]. +ATOM integrates the following data engines: + +- **numpy**: Transform the data to a [`numpy`](https://numpy.org/) array. +- **pandas**: Leave the dataset as a [`pandas`](https://pandas.pydata.org/docs/index.html) object. This is the default + engine, which leaves the data unchanged. +- **pandas-pyarrow**: Transform the data to [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html) with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html) + backend. Read more in pandas' [user guide](https://pandas.pydata.org/docs/user_guide/pyarrow.html). +- **polars**: The [polars](https://docs.pola.rs/) library is a blazingly fast dataframe library + implemented in Rust and based on Apache Arrow. Transforms the data to a polars + dataframe or series. +- **polars-lazy**: This engine is similar to the `polars` engine, but it returns + a [pl.LazyFrame](https://docs.pola.rs/py-polars/html/reference/lazyframe/index.html) instead of a [pl.DataFrame](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html). +- **pyarrow**: PyArrow is a cross-language, platform-independent, in-memory data + format that provides an efficient and fast way to serialize and deserialize data. + The data is transformed to a [pa.Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) or [pa.Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html). - **modin**: The [modin](https://modin.readthedocs.io/en/stable/) library is a multi-threading, drop-in replacement - for pandas, that uses [Ray](https://www.ray.io/) as backend. + for pandas that uses [Ray](https://www.ray.io/) as backend. Transform the data to a modin dataframe + or series. +- **dask**: The [dask](https://docs.dask.org/en/stable/) library is a powerful Python library for parallel and + distributed computing. Transform the data to a [dask dataframe](https://docs.dask.org/en/latest/dataframe.html) or [dask series](https://docs.dask.org/en/stable/generated/dask.dataframe.Series.html).
+- **pyspark**: The [pyspark](https://spark.apache.org/docs/latest/api/python/index.html) library is the Python API for Apache Spark. + Transform the data to a [pyspark dataframe](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html) or [pyspark series](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.Column.html). +- **pyspark-pandas**: Similar to the `pyspark` engine, but it returns pyspark objects + with the [pandas API](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html). !!! note - Although atom accepts a numpy array or a list of lists as input, it - converts the data internally to the specified data engine since its API - requires column names and indices. - -!!! warning - Depending on the data engine, the following limitations apply: - - - The `pyarrow` engine doesn't support [sparse datasets][]. - - The [LightGBM][] and [XGBoost][] models don't support the `pyarrow` engine. - - The `modin` engine is not compatible with [forecast][time-series] tasks. + It's important to realize that, within atom, the data is still processed using + pandas (with the numpy backend). Only when the data is returned to the user is it + transformed to the selected format. ## Estimator acceleration @@ -238,16 +254,18 @@ parallelization backends. mostly useful when the execution bottleneck is a compiled extension that explicitly releases the GIL (for instance a Cython loop wrapped in a "with nogil" block or an expensive call to a library such as numpy). -* **ray:** [Ray](https://www.ray.io/) is an open-source unified compute framework - that makes it easy to scale AI and Python workloads. Read more about Ray [here](https://docs.ray.io/en/latest/ray-core/walkthrough.html). - See [here][example-ray-backend] an example use case. +* **ray:** [Ray](https://www.ray.io/) is an open-source unified compute framework that makes it + easy to scale AI and Python workloads. Read more about Ray [here](https://docs.ray.io/en/latest/ray-core/walkthrough.html). See + [here][example-ray-backend] for an example use case. +* **dask:** [Dask](https://docs.dask.org/en/stable/) is a flexible parallel computing library for analytics. + Read more about Dask [here](https://docs.dask.org/en/stable/10-minutes-to-dask.html). The parallelization backend is applied in the following cases: * In every individual estimator that uses parallelization internally. * To calculate cross-validated results during [hyperparameter tuning][]. -* To train multiple models in parallel (when the trainer's `parallel` parameter is True). +* To train multiple models in parallel (when [`parallel=True`][directclassifier-parallel]). * To calculate partial dependencies in [plot_partial_dependence][]. !!! note diff --git a/docs_sources/user_guide/data_management.md b/docs_sources/user_guide/data_management.md index 0e000bdb2..f95fb2bbb 100644 --- a/docs_sources/user_guide/data_management.md +++ b/docs_sources/user_guide/data_management.md @@ -38,8 +38,8 @@ or together: * X * X, y -Remember to use the `y` parameter to indicate the target column in X when -using the first option. If not specified, the last column in X is used as +Remember to use the `y` parameter to indicate the target column in `X` when +using the first option. If not specified, the last column in `X` is used as the target. In both these cases, the sizes of the sets are defined using the `test_size` and `holdout_size` parameters. 
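As a minimal sketch of these input options (illustrative only; the breast-cancer dataset and the chosen split sizes are arbitrary examples, not part of this changeset):

```python
from sklearn.datasets import load_breast_cancer
from atom import ATOMClassifier

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Option 1: provide X and y separately; 20% test set and 10% holdout set
atom = ATOMClassifier(X, y, test_size=0.2, holdout_size=0.1, random_state=1)

# Option 2: provide a single dataset and point `y` to the target column
atom = ATOMClassifier(X.assign(target=y), y="target", random_state=1)
```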
Note that the splits are made after the subsample of the dataset with the `n_rows` parameter (when not diff --git a/docs_sources/user_guide/nomenclature.md b/docs_sources/user_guide/nomenclature.md index ef758c094..33bb42c2c 100644 --- a/docs_sources/user_guide/nomenclature.md +++ b/docs_sources/user_guide/nomenclature.md @@ -35,22 +35,22 @@ the target column.
dataframe
-Two-dimensional, size-mutable, potentially heterogeneous tabular data of type -[pd.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) -or its [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/dataframe.html) -counterpart. +Two-dimensional, size-mutable, potentially heterogeneous tabular data. +The type is usually [pd.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), +but could potentially be any of the dataframe types backed by the +selected [data engine][data-acceleration].
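For instance (an illustrative sketch, not part of the diff; it assumes the `polars` data engine described in the accelerating guide and polars installed as an optional dependency):

```python
import polars as pl
from sklearn.datasets import load_breast_cancer
from atom import ATOMClassifier

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# With a non-default data engine, "dataframe" refers to that engine's type
atom = ATOMClassifier(X, y, engine="polars", random_state=1)
assert isinstance(atom.dataset, pl.DataFrame)
```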
dataframe-like
-Any type object from which a [dataframe][] can be created. This includes an -[iterable](https://docs.python.org/3/glossary.html#term-iterable), a -[dict](https://docs.python.org/3/library/functions.html#func-dict) whose +Any type object from which a [pd.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) +can be created. This includes an [iterable](https://docs.python.org/3/glossary.html#term-iterable), +a [dict](https://docs.python.org/3/library/functions.html#func-dict) whose values are 1d-arrays, a two-dimensional [list](https://docs.python.org/3/library/functions.html#func-list), [tuple](https://docs.python.org/3/library/functions.html#func-tuple), [np.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) or [sps.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html), -and most commonly, a [dataframe][]. This is the standard input format for -any dataset. +or any object that follows the [dataframe interchange protocol](https://data-apis.org/dataframe-protocol/latest/index.html). +This is the standard input format for any dataset. Additionally, you can provide a callable whose output is any of the aforementioned types. This is useful when the dataset is very large and @@ -67,13 +67,6 @@ method. Often used interchangeably with [predictor][] because of user preference.
-
index
-
-Immutable sequence used for indexing and alignment of type [pd.Index](https://pandas.pydata.org/docs/reference/api/pandas.Index.html) -or their [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/dataframe.html) -counterpart. -
-
missing values
All values in the [`missing`][atomclassifier-missing] attribute, as @@ -129,10 +122,10 @@ column.
series
-One-dimensional ndarray with axis labels of type -[pd.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html#pandas.Series) -or its [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/series.html) -counterpart. +One-dimensional ndarray with axis labels. The type is usually +[pd.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html#pandas.Series), +but could potentially be any of the series types backed by the +selected [data engine][data-acceleration].
target
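To make the `dataframe-like` definition above concrete, here is a small sketch (illustrative only; polars is just one example of a library whose frames implement the dataframe interchange protocol):

```python
import numpy as np
import pandas as pd
import polars as pl

# Classic dataframe-like inputs: a 2d array, a dict of 1d arrays, a dataframe
for obj in (np.ones((3, 2)), {"a": [1, 2, 3], "b": [4, 5, 6]}, pd.DataFrame({"a": [1, 2, 3]})):
    print(pd.DataFrame(obj).shape)

# Objects following the dataframe interchange protocol also qualify, e.g. a
# polars frame, which pandas can rebuild through its `__dataframe__` method
pl_df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
pd_df = pd.api.interchange.from_dataframe(pl_df)
print(pd_df.dtypes)
```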
diff --git a/pyproject.toml b/pyproject.toml index aa32035f3..981280631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,29 +21,24 @@ classifiers = [ dependencies = [ "beartype>=0.16.4", "category-encoders>=2.6.3", - "dagshub>=0.3.8", "dill>=0.3.6", + "featuretools>=1.28.0", "gplearn>=0.4.2", "imbalanced-learn>=0.12.0", "ipython>=8.11.0", "ipywidgets>=8.1.1", - "featuretools>=1.28.0", "joblib>=1.3.1", "matplotlib>=3.7.2", - "mlflow>=2.7.1", - "modin[ray]>=0.25.0", + "mlflow>=2.10.2", "nltk>=3.8.1", "numpy>=1.23.0", "optuna>=3.4.0", - "pandas[parquet]>=2.1.2", + "pandas>=2.1.2", "plotly>=5.18.0", - "ray[serve]>=2.9.1", - "requests>=2.31.0", "scikit-learn>=1.4.0", - "scikit-learn-intelex>=2023.2.1; platform_machine == 'x86_64' or platform_machine == 'AMD64'", "scipy>=1.10.1", "shap>=0.43.0", - "sktime[forecasting]>=0.24.0", + "sktime[forecasting]>=0.26.0", "statsmodels>=0.14.1", "zoofs>=0.1.26", ] @@ -52,9 +47,18 @@ dependencies = [ full = [ "botorch>=0.8.5", "catboost>=1.2", + "dagshub>=0.3.8", + "dask[distributed]>=2024.2.0", "explainerdashboard>=0.4.3", "gradio>=3.44.4", "lightgbm>=4.1.0", + "modin[ray]>=0.25.0", + "polars>=0.20.7", + "pyarrow>=15.0.0", + "pyspark>=3.5.0", + "ray[serve]>=2.9.1", + "requests>=2.31.0", + "scikit-learn-intelex>=2023.2.1; platform_machine == 'x86_64' or platform_machine == 'AMD64'", "schemdraw>=0.16", "statsforecast>=1.6.0", "sweetviz>=2.3.1", @@ -70,7 +74,6 @@ dev = [ "pandas_stubs>=2.1.1.230928", "pre-commit>=3.5.0", "ruff>=0.1.7", - "types-requests>=2.31.0.10", # Testing "nbmake>=1.4.1", # To test example notebooks "pytest>=7.2.1", @@ -187,4 +190,8 @@ convention = "numpy" [tool.mypy] ignore_missing_imports = true -disable_error_code = ["attr-defined"] +disable_error_code = [ + "attr-defined", + "abstract", # See https://github.com/python/mypy/issues/4717 + "override", # Transformers' methods don't always match with that of TransformerMixin +] diff --git a/tests/conftest.py b/tests/conftest.py index 9721308ff..97c7858ba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,9 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest +from ray.util.joblib import register_ray from sklearn.base import BaseEstimator from sklearn.datasets import ( load_breast_cancer, load_diabetes, load_wine, @@ -23,7 +25,7 @@ from sktime.split import temporal_train_test_split from atom.data_cleaning import TransformerMixin -from atom.utils.utils import merge, n_cols, to_df, to_pandas +from atom.utils.utils import merge, n_cols, to_df, to_tabular if TYPE_CHECKING: @@ -31,7 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import DataFrame, Pandas, Sequence, XSelector + from atom.utils.types import DataFrame, Pandas, Sequence, XConstructor class DummyTransformer(TransformerMixin, BaseEstimator): @@ -107,6 +109,18 @@ def _mock_mlflow_log_model(mocker): mocker.patch("mlflow.sklearn.log_model") +@pytest.fixture(autouse=True) +def _register_ray(): + """Register ray as joblib backend. + + Although atom does this internally, it's skipped when ray is + mocked. Not registering it fails the call to joblib.parallel_config + in basetransformer.py. + + """ + register_ray() + + @pytest.fixture() def random(): """Return numpy's default random number generator.""" @@ -114,8 +128,8 @@ def random(): def get_train_test( - X: XSelector | None, - y: Sequence[Any] | DataFrame, + X: XConstructor | None, + y: Sequence[Any] | pd.DataFrame, ) -> Pandas | tuple[Pandas, Pandas]: """Get train and test sets from X and y. 
@@ -125,7 +139,7 @@ def get_train_test( Feature set. If None, split as time series data set. y: sequence or DataFrame - Target column corresponding to `X`. + Target column(s) corresponding to `X`. Returns ------- @@ -138,7 +152,8 @@ def get_train_test( """ if X is not None: return train_test_split( - merge(to_df(X), to_pandas(y, columns=[f"y{i}" for i in range(n_cols(y))])), + merge(to_df(X), to_tabular(y, columns=[f"y{i}" for i in range(n_cols(y))])), + shuffle=False, test_size=0.3, random_state=1, ) @@ -154,6 +169,9 @@ def get_train_test( X_class, y_class = load_wine(return_X_y=True, as_frame=True) X_reg, y_reg = load_diabetes(return_X_y=True, as_frame=True) +# Pyarrow dtypes +X_pa = X_bin.astype(pd.ArrowDtype(pa.float64())) + # Multilabel classification data X_label, y_label = make_multilabel_classification(n_samples=200, n_classes=4) diff --git a/tests/test_atom.py b/tests/test_atom.py index b59e857b1..f04906c3e 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest from category_encoders.target_encoder import TargetEncoder from pandas.testing import assert_frame_equal, assert_index_equal @@ -32,7 +33,7 @@ from .conftest import ( X10, DummyTransformer, X10_dt, X10_nan, X10_str, X10_str2, X20_out, X_bin, - X_class, X_ex, X_label, X_reg, X_sparse, X_text, y10, y10_label, + X_class, X_ex, X_label, X_pa, X_reg, X_sparse, X_text, y10, y10_label, y10_label2, y10_sn, y10_str, y_bin, y_class, y_ex, y_fc, y_label, y_multiclass, y_multireg, y_reg, ) @@ -313,6 +314,13 @@ def test_inverse_transform(): assert_frame_equal(atom.inverse_transform(atom.X), X_bin) +def test_inverse_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + atom.scale() + assert isinstance(atom.inverse_transform(X_bin), pa.Table) + + def test_load_no_atom(): """Assert that an error is raised when the instance is not atom.""" trainer = DirectClassifier("LR", random_state=1) @@ -442,8 +450,8 @@ def test_shrink_dense2sparse(): def test_shrink_pyarrow(): - """Assert that it works with the pyarrow data backend.""" - atom = ATOMClassifier(X_bin, y_bin, engine={"data": "pyarrow"}, random_state=1) + """Assert that it works with pyarrow dtypes.""" + atom = ATOMClassifier(X_pa, y_bin, engine="pandas-pyarrow", random_state=1) assert atom.dtypes[0].name == "double[pyarrow]" atom.shrink() assert atom.dtypes[0].name == "float[pyarrow]" @@ -488,6 +496,13 @@ def test_transform_not_train_only(): assert len(atom.transform(X_bin)) == len(X_bin) +def test_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + atom.scale() + assert isinstance(atom.transform(X_bin), pa.Table) + + # Test base transformers =========================================== >> def test_add_after_model(): @@ -630,15 +645,15 @@ def test_add_keep_column_names(): assert atom.features.tolist() == ["x0", "x1", "x2", "x3"] # Transformer keeps rows equal - atom.add(DummyTransformer(strategy="equal"), get_feature_names_out=None) + atom.add(DummyTransformer(strategy="equal"), feature_names_out=None) assert atom.features.tolist() == ["x0", "x1", "x2", "x3"] # Transformer drops rows - atom.add(DummyTransformer(strategy="drop"), get_feature_names_out=None) + atom.add(DummyTransformer(strategy="drop"), feature_names_out=None) assert atom.features.tolist() == ["x0", "x2", "x3"] # 
Transformer adds a new column - atom.add(DummyTransformer(strategy="add"), columns="!x2", get_feature_names_out=None) + atom.add(DummyTransformer(strategy="add"), columns="!x2", feature_names_out=None) assert atom.features.tolist() == ["x0", "x2", "x3", "x4"] @@ -649,9 +664,9 @@ def test_raise_length_mismatch(): atom.prune(columns=[2, 4]) -def test_add_pyarrow_columns(): +def test_keep_pyarrow_dtypes(): """Assert that columns keep the pyarrow dtype.""" - atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + atom = ATOMClassifier(X_pa, y_bin, random_state=1) assert isinstance(atom.dtypes[0], pd.ArrowDtype) atom.scale() assert isinstance(atom.dtypes[0], pd.ArrowDtype) diff --git a/tests/test_basemodel.py b/tests/test_basemodel.py index db2b3009f..fb1e81183 100644 --- a/tests/test_basemodel.py +++ b/tests/test_basemodel.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd +import polars as pl import pytest import requests from optuna.distributions import CategoricalDistribution, IntDistribution @@ -871,6 +872,13 @@ def test_inverse_transform(): assert_frame_equal(atom.lr.inverse_transform(atom.lr.X), X_bin) +def test_inverse_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="polars", random_state=1) + atom.run("Tree") + assert isinstance(atom.tree.inverse_transform(X_bin), pl.DataFrame) + + def test_save_estimator(): """Assert that the save_estimator saves a pickle file.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) @@ -918,6 +926,13 @@ def test_transform(): assert all(-3 <= v <= 3 for v in X.to_numpy().ravel()) # Data is scaled +def test_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="polars", random_state=1) + atom.run("Tree") + assert isinstance(atom.tree.transform(X_bin), pl.DataFrame) + + # Test ClassRegModel ================================================== >> def test_classreg_get_tags(): diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 1a99684c0..01160c2b1 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -18,7 +18,7 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from atom import ATOMClassifier, ATOMForecaster, ATOMRegressor -from atom.branch import Branch +from atom.data import Branch from atom.training import DirectClassifier, DirectForecaster from atom.utils.types import SPTuple from atom.utils.utils import NotFittedError, merge @@ -319,7 +319,7 @@ def test_results_property_train_sizing(): assert list(atom.results.index.get_level_values(0)) == [0.2, 0.4, 0.6, 0.8, 1.0] -# Test _set_index ================================================== >> +# Test _get_data =================================================== >> def test_index_is_true(): """Assert that the indices are left as is when index=True.""" @@ -406,8 +406,6 @@ def test_duplicate_indices(): ATOMClassifier(X_bin, X_bin, index=True, random_state=1) -# Test _get_stratify_columns======================================== >> - @pytest.mark.parametrize("stratify", [True, -1, "target", [-1]]) def test_stratify_options(stratify): """Assert that the data can be stratified among data sets.""" @@ -437,8 +435,6 @@ def test_stratify_invalid_column_str(): ATOMClassifier(X_bin, y_bin, stratify="invalid", random_state=1) -# Test _get_data =================================================== >> - def test_input_is_y_without_arrays(): """Assert that input y through 
parameter works.""" atom = ATOMForecaster(y=y_fc, random_state=1) @@ -597,7 +593,7 @@ def test_input_is_train_test_with_parameter_y(): def test_input_is_train_test_for_forecast(): """Assert that input train, test works for forecast tasks.""" - trainer = DirectForecaster("ES", random_state=1) + trainer = DirectForecaster("Croston", random_state=1) trainer.run(fc_train, fc_test) assert_series_equal(trainer.y, pd.concat([fc_train, fc_test])) @@ -606,8 +602,8 @@ def test_input_is_3_tuples(): """Assert that the 3 tuples input works.""" X_train = bin_train.iloc[:, :-1] y_train = bin_train.iloc[:, -1] - X_test = bin_test.iloc[100:-20, :-1] - y_test = bin_test.iloc[100:-20, -1] + X_test = bin_test.iloc[:-20, :-1] + y_test = bin_test.iloc[:-20, -1] X_holdout = bin_test.iloc[-20:, :-1] y_holdout = bin_test.iloc[-20:, -1] @@ -626,7 +622,7 @@ def test_input_is_train_test_holdout(): def test_4_data_provided(): - """Assert that the 4 elements input works.""" + """Assert that the four-element input works.""" X_train = bin_train.iloc[:, :-1] X_test = bin_test.iloc[:, :-1] y_train = bin_train.iloc[:, -1] @@ -638,11 +634,11 @@ def test_4_data_provided(): def test_6_data_provided(): - """Assert that the 6 elements input works.""" + """Assert that the six-element input works.""" X_train = bin_train.iloc[:, :-1] y_train = bin_train.iloc[:, -1] - X_test = bin_test.iloc[100:-20, :-1] - y_test = bin_test.iloc[100:-20, -1] + X_test = bin_test.iloc[:-20, :-1] + y_test = bin_test.iloc[:-20, -1] X_holdout = bin_test.iloc[-20:, :-1] y_holdout = bin_test.iloc[-20:, -1] diff --git a/tests/test_basetrainer.py b/tests/test_basetrainer.py index e782fdd4b..26f1af366 100644 --- a/tests/test_basetrainer.py +++ b/tests/test_basetrainer.py @@ -9,7 +9,6 @@ import mlflow import pytest -import ray from mlflow.tracking.fluent import ActiveRun from optuna.distributions import CategoricalDistribution, IntDistribution from optuna.pruners import MedianPruner @@ -376,21 +375,32 @@ def test_errors_keep(): assert trainer._models == [trainer.lda] -@patch("atom.basetransformer.ray", MagicMock()) -@patch("atom.basetrainer.ray", MagicMock()) +# @patch("atom.basetransformer.ray", MagicMock()) +# @patch("atom.basetrainer.ray", MagicMock()) def test_parallel_with_ray(): """Assert that parallel runs successfully with ray backend.""" trainer = DirectClassifier( models=["LR", "LDA"], parallel=True, - n_jobs=1, + n_jobs=2, backend="ray", random_state=1, ) - # Fails because Mock returns empty list + # Fails because MagicMock returns an empty list with pytest.raises(RuntimeError, match=".*All models failed.*"): trainer.run(bin_train, bin_test) - ray.shutdown() + + +def test_parallel_with_dask(): + """Assert that parallel runs successfully with dask backend.""" + trainer = DirectClassifier( + models=["LR", "LDA"], + parallel=True, + n_jobs=2, + backend="dask", + random_state=1, + ) + trainer.run(bin_train, bin_test) @patch("atom.basetrainer.Parallel", MagicMock()) diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index 3cffdfcd0..9357abdb9 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -57,30 +57,11 @@ def test_device_parameter(): assert os.environ["CUDA_VISIBLE_DEVICES"] == "0" -@patch("ray.init") -def test_engine_parameter_modin(ray): - """Assert that ray is initialized when modin is data backend.""" - base = BaseTransformer(device="cpu", engine="modin") - assert base.engine.data == "modin" - assert ray.is_called_once - - -def test_engine_parameter_env_var(): - """Assert that the environment 
variable is set.""" - base = BaseTransformer(device="cpu", engine="pyarrow") - assert base.engine == EngineTuple(data="pyarrow", estimator="sklearn") - assert os.environ["ATOM_DATA_ENGINE"] == base.engine.data - - base = BaseTransformer(device="cpu", engine="sklearnex") - assert base.engine == EngineTuple(data="pandas", estimator="sklearnex") - assert os.environ["ATOM_DATA_ENGINE"] == base.engine.data - - -@patch.dict("sys.modules", {"sklearnex": None}) -def test_engine_parameter_no_sklearnex(): - """Assert that an error is raised when sklearnex is not installed.""" - with pytest.raises(ModuleNotFoundError, match=".*import scikit-learn-intelex.*"): - BaseTransformer(device="cpu", engine={"estimator": "sklearnex"}) +@pytest.mark.parametrize("engine", [None, "pandas", "sklearn", {}, EngineTuple()]) +def test_engine_parameter(engine): + """Assert that the engine parameter can be initialized.""" + base = BaseTransformer(engine=engine) + assert base.engine == EngineTuple() @pytest.mark.skipif(machine() not in ("x86_64", "AMD64"), reason="Only x86 support") @@ -103,6 +84,13 @@ def test_backend_parameter_ray(ray): assert ray.is_called_once +@patch("dask.distributed.Client") +def test_backend_parameter_dask(dask): + """Assert that dask is initialized when selected.""" + BaseTransformer(backend="dask") + assert dask.is_called_once + + def test_backend_parameter(): """Assert that other backends can be specified.""" base = BaseTransformer(backend="threading") @@ -182,53 +170,6 @@ def test_device_id_invalid(): BaseTransformer(device="gpu:2,3") -# Test _inherit ==================================================== >> - -def test_inherit(): - """Assert that the inherit method passes the parameters correctly.""" - base = BaseTransformer(n_jobs=2, random_state=2) - svc = base._inherit(RandomForestClassifier()) - assert svc.get_params()["n_jobs"] == 2 - assert svc.get_params()["random_state"] == 2 - - -def test_inherit_with_fixed_params(): - """Assert that fixed parameters aren't inherited.""" - base = BaseTransformer(random_state=2) - chain = base._inherit(ClassifierChain(SVC(random_state=3)), ("base_estimator__random_state",)) - assert chain.get_params()["random_state"] == 2 - assert chain.base_estimator.get_params()["random_state"] == 3 - - -def test_inherit_sp(): - """Assert that the seasonal periodicity is correctly inherited.""" - atom = ATOMForecaster(y_fc, sp=[12, 24], random_state=1) - atom.run( - models=["bats", "tbats"], - est_params={ - "bats": {"use_box_cox": False, "use_trend": False, "use_arma_errors": False}, - "tbats": {"use_box_cox": False, "use_trend": False, "use_arma_errors": False}, - }, - ) - assert atom.bats.estimator.get_params()["sp"] == 12 # Single seasonality - assert atom.tbats.estimator.get_params()["sp"] == [12, 24] # Multiple seasonality - - -# Test _get_est_class ============================================== >> - -@pytest.mark.skipif(machine() not in ("x86_64", "AMD64"), reason="Only x86 support") -def test_get_est_class_from_engine(): - """Assert that the class can be retrieved from an engine.""" - base = BaseTransformer(device="cpu", engine={"estimator": "sklearnex"}) - assert base._get_est_class("SVC", "svm") == SVC - - -def test_get_est_class_from_default(): - """Assert that the class is retrieved from sklearn when import fails.""" - base = BaseTransformer(device="cpu", engine={"estimator": "sklearnex"}) - assert base._get_est_class("GaussianNB", "naive_bayes") == GaussianNB - - # Test _check_input ============================================== >> def 
@@ -266,7 +207,7 @@ def test_column_order_is_retained():
 
 def test_incorrect_columns():
     """Assert that an error is raised when the provided columns do not match."""
-    with pytest.raises(ValueError, match=".*features are different.*"):
+    with pytest.raises(ValueError, match=".*columns are different.*"):
         BaseTransformer._check_input(X_bin, columns=["1", "2"])
 
 
@@ -323,12 +264,6 @@ def test_sparse_matrices_2_tuples():
     assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]"
 
 
-def test_target_is_dict():
-    """Assert that the target column is assigned correctly for a dict."""
-    _, y = BaseTransformer._check_input(X10, {"a": [0] * 10})
-    assert isinstance(y, pd.Series)
-
-
 def test_multioutput_str():
     """Assert that multioutput can be assigned by column name."""
     X, y = BaseTransformer._check_input(X_bin, ["mean radius", "worst perimeter"])
@@ -397,6 +332,53 @@ def test_X_empty_df():
     assert isinstance(y, pd.Series)
 
 
+# Test _inherit ==================================================== >>
+
+def test_inherit():
+    """Assert that the inherit method passes the parameters correctly."""
+    base = BaseTransformer(n_jobs=2, random_state=2)
+    svc = base._inherit(RandomForestClassifier())
+    assert svc.get_params()["n_jobs"] == 2
+    assert svc.get_params()["random_state"] == 2
+
+
+def test_inherit_with_fixed_params():
+    """Assert that fixed parameters aren't inherited."""
+    base = BaseTransformer(random_state=2)
+    chain = base._inherit(ClassifierChain(SVC(random_state=3)), ("base_estimator__random_state",))
+    assert chain.get_params()["random_state"] == 2
+    assert chain.base_estimator.get_params()["random_state"] == 3
+
+
+def test_inherit_sp():
+    """Assert that the seasonal periodicity is correctly inherited."""
+    atom = ATOMForecaster(y_fc, sp=[12, 24], random_state=1)
+    atom.run(
+        models=["bats", "tbats"],
+        est_params={
+            "bats": {"use_box_cox": False, "use_trend": False, "use_arma_errors": False},
+            "tbats": {"use_box_cox": False, "use_trend": False, "use_arma_errors": False},
+        },
+    )
+    assert atom.bats.estimator.get_params()["sp"] == 12  # Single seasonality
+    assert atom.tbats.estimator.get_params()["sp"] == [12, 24]  # Multiple seasonality
+
+
+# Test _get_est_class ============================================== >>
+
+@pytest.mark.skipif(machine() not in ("x86_64", "AMD64"), reason="Only x86 support")
+def test_get_est_class_from_engine():
+    """Assert that the class can be retrieved from an engine."""
+    base = BaseTransformer(device="cpu", engine={"estimator": "sklearnex"})
+    assert base._get_est_class("SVC", "svm") == SVC
+
+
+def test_get_est_class_from_default():
+    """Assert that the class is retrieved from sklearn when import fails."""
+    base = BaseTransformer(device="cpu", engine={"estimator": "sklearnex"})
+    assert base._get_est_class("GaussianNB", "naive_bayes") == GaussianNB
+
+
 # Test log ========================================================= >>
 
 def test_log_severity_error():
diff --git a/tests/test_branch.py b/tests/test_data.py
similarity index 86%
rename from tests/test_branch.py
rename to tests/test_data.py
index 5484168d4..9bee37d88 100644
--- a/tests/test_branch.py
+++ b/tests/test_data.py
@@ -7,13 +7,19 @@
 
 import glob
 import os
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
+import dask.dataframe as dd
+import numpy as np
 import pandas as pd
+import polars as pl
+import pyarrow as pa
 import pytest
 from pandas.testing import assert_frame_equal
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 
 from atom import ATOMClassifier, ATOMRegressor
-from atom.branch import Branch, BranchManager
+from atom.data import Branch, BranchManager
 from atom.training import DirectClassifier
 from atom.utils.utils import merge
@@ -276,14 +282,14 @@ def test_data_properties_to_df():
     """Assert that the data attributes are converted to a df at setter."""
     atom = ATOMClassifier(X_bin, y_bin, random_state=1)
     atom.X = X_bin_array
-    assert isinstance(atom.X, pd.DataFrame)
+    assert isinstance(atom.branch.X, pd.DataFrame)
 
 
 def test_data_properties_to_series():
     """Assert that the data attributes are converted to a series at setter."""
     atom = ATOMClassifier(X_bin, y_bin, random_state=1)
     atom.y = y_bin_array
-    assert isinstance(atom.y, pd.Series)
+    assert isinstance(atom.branch.y, pd.Series)
 
 
 def test_setter_error_unequal_rows():
@@ -304,7 +310,7 @@ def test_setter_error_unequal_columns():
     """Assert that an error is raised when the setter has unequal columns."""
     atom = ATOMClassifier(X_bin, y_bin, random_state=1)
     new_X = atom.train
-    new_X.insert(0, "new_column", 1)
+    new_X["new_column"] = 1
     with pytest.raises(ValueError, match="number of columns"):
         atom.train = new_X
 
@@ -561,6 +567,30 @@ def test_load_no_dir():
         atom.branch = "main"
 
 
+def test_check_scaling_scaler_in_pipeline():
+    """Assert that check_scaling returns True when there's a scaler in the pipeline."""
+    atom = ATOMClassifier(X_bin, y=y_bin, random_state=1)
+    assert not atom.branch.check_scaling()
+    atom.add(MinMaxScaler())
+    assert atom.branch.check_scaling()
+
+
+def test_check_scaling():
+    """Assert that the check_scaling method returns whether the data is scaled."""
+    scaler = StandardScaler()
+    scaler.__class__.__name__ = "OtherName"
+
+    atom = ATOMClassifier(X_bin, y=y_bin, random_state=1)
+    atom.add(scaler)
+    assert atom.branch.check_scaling()
+
+
+def test_check_scaling_drop_binary():
+    """Assert that binary rows are dropped to check scaling."""
+    atom = ATOMClassifier(np.tile(y10, (10, 1)), y=y10, random_state=1)
+    assert atom.branch.check_scaling()
+
+
 # Test BranchManager =============================================== >>
 
 def test_branchmanager_repr():
@@ -665,3 +695,76 @@ def test_reset():
     assert len(atom._branches) == 1
     assert not glob.glob("joblib/atom/Branch(main).pkl")
     assert atom.og is atom.branch
+
+
+# Test data engines ================================================ >>
+
+def test_numpy_engine():
+    """Assert that the numpy engine returns a numpy array."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="numpy", random_state=1)
+    assert isinstance(atom.dataset, np.ndarray)
+
+
+def test_pandas_numpy_engine():
+    """Assert that the pandas engine returns numpy dtypes."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="pandas", random_state=1)
+    assert all(isinstance(dtype, np.dtype) for dtype in atom.dataset.dtypes)
+    assert isinstance(atom.y.dtype, np.dtype)
+
+
+def test_pandas_pyarrow_engine():
+    """Assert that the pandas-pyarrow engine returns pyarrow dtypes."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="pandas-pyarrow", random_state=1)
+    assert all(isinstance(dtype, pd.ArrowDtype) for dtype in atom.dataset.dtypes)
+    assert isinstance(atom.y.dtype, pd.ArrowDtype)
+
+
+def test_polars_engine():
+    """Assert that the polars engine returns polars types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="polars", random_state=1)
+    assert isinstance(atom.X, pl.DataFrame)
+    assert isinstance(atom.y, pl.Series)
+
+
+def test_polars_lazy_engine():
+    """Assert that the polars-lazy engine returns polars types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="polars-lazy", random_state=1)
+    assert isinstance(atom.X, pl.LazyFrame)
+    assert isinstance(atom.y, pl.Series)
+
+
+def test_pyarrow_engine():
+    """Assert that the pyarrow engine returns pyarrow types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1)
+    assert isinstance(atom.X, pa.Table)
+    assert isinstance(atom.y, pa.Array)
+
+
+@patch.dict("sys.modules", {"modin": MagicMock(spec=["__spec__", "pandas"])})
+def test_modin_engine():
+    """Assert that the modin engine returns modin types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="modin", random_state=1)
+    assert "DataFrame" in str(atom.X)
+    assert "Series" in str(atom.y)
+
+
+def test_dask_engine():
+    """Assert that the dask engine returns dask types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="dask", random_state=1)
+    assert isinstance(atom.X, dd.DataFrame)
+    assert isinstance(atom.y, dd.Series)
+
+
+@patch.dict("sys.modules", {"pyspark.sql": MagicMock(spec=["__spec__", "SparkSession"])})
+def test_pyspark_engine():
+    """Assert that the pyspark engine returns pyspark types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="pyspark", random_state=1)
+    assert "createDataFrame" in str(atom.X)
+
+
+@patch.dict("sys.modules", {"pyspark": MagicMock(spec=["__spec__", "pandas"])})
+def test_pyspark_pandas_engine():
+    """Assert that the pyspark-pandas engine returns pyspark pandas types."""
+    atom = ATOMClassifier(X_bin, y_bin, engine="pyspark-pandas", random_state=1)
+    assert "DataFrame" in str(atom.X)
+    assert "Series" in str(atom.y)
diff --git a/tests/test_data_cleaning.py b/tests/test_data_cleaning.py
index c888b4525..8c5bd88ff 100644
--- a/tests/test_data_cleaning.py
+++ b/tests/test_data_cleaning.py
@@ -423,8 +423,8 @@ def test_missing_values_are_propagated():
 def test_unknown_classes_are_imputed():
     """Assert that unknown classes are imputed."""
     encoder = Encoder()
-    encoder.fit(["a", "b", "b", "a"])
-    assert encoder.transform(["c"]).iloc[0, 0] == -1.0
+    encoder.fit([["a"], ["b"], ["b"], ["a"]])
+    assert encoder.transform([["c"]]).iloc[0, 0] == -1.0
 
 
 def test_ordinal_encoder():
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index b7733ae3a..e4826620a 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -5,7 +5,6 @@
 
 """
 
-from unittest.mock import MagicMock, patch
 
 import pandas as pd
 import pytest
@@ -191,21 +190,6 @@ def test_hashing():
     assert "hash1" in X
 
 
-@patch.dict(
-    "sys.modules",
-    {
-        "cuml": MagicMock(spec=["__spec__"]),
-        "cuml.common.device_selection": MagicMock(spec=["set_global_device_type"]),
-        "cuml.internals.memory_utils": MagicMock(spec=["set_global_output_type"]),
-        "cuml.feature_extraction.text": MagicMock(),
-    },
-)
-def test_gpu():
-    """Assert that the gpu implementation calls the get method of matrix."""
-    vectorizer = Vectorizer(device="gpu", engine="cuml")
-    pytest.raises(ValueError, vectorizer.fit_transform, X_text)
-
-
 def test_return_sparse():
     """Assert that the output is sparse."""
     X = Vectorizer(strategy="bow", return_sparse=True).fit_transform(X_text, y10)
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 586c2a737..d87681fc0 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 from pandas.testing import assert_frame_equal
 from sklearn.preprocessing import LabelEncoder, StandardScaler
@@ -216,6 +217,18 @@ def test_predict_var(pipeline_ts):
     assert isinstance(pipeline_ts.predict_var(fh=range(3)), pd.DataFrame)
 
 
+def test_set_output(pipeline):
+    """Assert that the set_output method determines the data engine."""
+    pl = pipeline(model=False)
+    assert isinstance(pl.transform(X_bin), pd.DataFrame)
+
+    pl.set_output(transform="numpy")
+    assert isinstance(pl.fit_transform(X_bin, y_bin)[0], np.ndarray)
+
+    pl.set_output(transform="pyarrow")
+    assert isinstance(pl.inverse_transform(X_bin), pa.Table)
+
+
 def test_score_no_parameters(pipeline_ts):
     """Assert that an error is raised when X and fh are both None."""
     with pytest.raises(ValueError, match=".*cannot be both None.*"):
diff --git a/tests/test_training.py b/tests/test_training.py
index bc3d8f6c9..ce90a0acb 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -36,7 +36,7 @@ def test_models_are_restored():
     )
     sh.run(reg_train, reg_test)
     assert "Tree" not in sh._models  # The original model is deleted
-    assert all(m in sh.models for m in ("Tree4", "AdaB2", "LGB1"))
+    assert all(m in sh.models for m in ("Tree4", "AdaB2", "AdaB1"))
 
 
 def test_ts_int_train_sizes():
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 486c94eb3..12f107676 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -6,7 +6,7 @@
 """
 
 from datetime import timedelta
-from unittest.mock import Mock, patch
+from unittest.mock import patch
 
 import numpy as np
 import pytest
@@ -18,9 +18,7 @@
 from atom import show_versions
 from atom.pipeline import Pipeline
 from atom.utils.patches import VotingClassifier, VotingRegressor
-from atom.utils.utils import (
-    ClassMap, check_is_fitted, time_to_str, to_df, to_series,
-)
+from atom.utils.utils import ClassMap, check_is_fitted, time_to_str
 
 from .conftest import X_bin, X_reg, y_bin, y_reg
 
@@ -158,9 +156,3 @@ def test_time_to_string():
     assert time_to_str(timedelta(seconds=17).total_seconds()).startswith("17.00")
     assert time_to_str(timedelta(minutes=1, seconds=2).total_seconds()) == "01m:02s"
     assert time_to_str(timedelta(hours=3, minutes=8).total_seconds()) == "03h:08m:00s"
-
-
-def test_to_pandas_with_cuml():
-    """Assert that cuML objects use the to_pandas method."""
-    to_df(Mock(spec=["to_pandas"]), columns=[0, 1])
-    to_series(Mock(spec=["to_pandas"]))