From d049e6a77a6ca5cb41f6938bd0062b7bceffd9da Mon Sep 17 00:00:00 2001 From: Mavs Date: Sun, 11 Feb 2024 20:35:31 +0100 Subject: [PATCH 01/12] data engine 3 --- atom/_show_versions.py | 3 + atom/api.py | 58 ++- atom/atom.py | 172 ++++--- atom/basemodel.py | 233 ++++----- atom/baserunner.py | 201 ++++++-- atom/basetrainer.py | 9 +- atom/basetransformer.py | 285 +++-------- atom/branch/branch.py | 136 ++++-- atom/branch/branchmanager.py | 7 +- atom/branch/dataengines.py | 209 +++++++++ atom/data_cleaning.py | 199 ++++---- atom/feature_engineering.py | 58 +-- atom/models/classreg.py | 16 +- atom/models/custom.py | 3 +- atom/nlp.py | 12 +- atom/pipeline.py | 31 +- atom/plots/baseplot.py | 9 +- atom/plots/hyperparametertuningplot.py | 7 +- atom/plots/predictionplot.py | 12 +- atom/training.py | 9 + atom/utils/types.py | 56 ++- atom/utils/utils.py | 519 +++++++++------------ docs_sources/changelog/v6.x.x.md | 1 + docs_sources/dependencies.md | 5 +- docs_sources/scripts/autodocs.py | 4 +- docs_sources/user_guide/accelerating.md | 70 +-- docs_sources/user_guide/data_management.md | 4 +- docs_sources/user_guide/nomenclature.md | 33 +- pyproject.toml | 3 + tests/conftest.py | 10 +- tests/test_basetrainer.py | 18 +- tests/test_basetransformer.py | 29 +- tests/test_branch.py | 54 ++- 33 files changed, 1361 insertions(+), 1114 deletions(-) create mode 100644 atom/branch/dataengines.py diff --git a/atom/_show_versions.py b/atom/_show_versions.py index ed013853e..22e02a07f 100644 --- a/atom/_show_versions.py +++ b/atom/_show_versions.py @@ -35,6 +35,8 @@ "optuna", "pandas", "plotly", + "polars", + "pyarrow", "ray", "requests", "sklearn", @@ -42,6 +44,7 @@ "scipy", "shap", "sktime", + "statsmodels", "zoofs", # Has no __version__ attribute ] diff --git a/atom/api.py b/atom/api.py index 6bb22eaaf..2fc4acdeb 100644 --- a/atom/api.py +++ b/atom/api.py @@ -158,20 +158,22 @@ class ATOMClassifier(ATOM): **X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features). - **y: int, str or sequence**
- Target column corresponding to `X`. + **y: int, str, dict, sequence or dataframe**
+ Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. y: int, str, dict, sequence or dataframe, default=-1 - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. @@ -276,6 +278,7 @@ class ATOMClassifier(ATOM): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -428,11 +431,12 @@ class ATOMForecaster(ATOM): Exogenous feature set corresponding to y, with shape=(n_samples, n_features). - **y: int, str or sequence**
+ **y: int, str, dict, sequence or dataframe**
Time series. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. @@ -440,9 +444,10 @@ class ATOMForecaster(ATOM): y: int, str, dict, sequence or dataframe, default=-1 Time series. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. @@ -545,6 +550,7 @@ class ATOMForecaster(ATOM): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -689,21 +695,23 @@ class ATOMRegressor(ATOM): **X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features). - **y: int, str or sequence**
- Target column corresponding to `X`. + **y: int, str, dict, sequence or dataframe**
+ Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. y: int, str, dict, sequence or dataframe, default=-1 - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. @@ -775,9 +783,16 @@ class ATOMRegressor(ATOM): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -794,6 +809,7 @@ class ATOMRegressor(ATOM): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the diff --git a/atom/atom.py b/atom/atom.py index 37abdd8ab..a022f6bcd 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -53,19 +53,18 @@ Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DataFrame, DiscretizerStrats, Engine, EngineTuple, Estimator, FeatureNamesOut, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, - FloatLargerZero, FloatZeroToOneInc, Index, IndexSelector, Int, - IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, - ModelsConstructor, NItems, NJobs, NormalizerStrats, NumericalStrats, - Operators, Pandas, Predictor, PrunerStrats, RowSelector, Scalar, - ScalerStrats, Seasonality, Sequence, Series, SPDict, TargetSelector, - Transformer, VectorizerStarts, Verbose, Warnings, XSelector, YSelector, - sequence_t, + FloatLargerZero, FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, + IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, + NJobs, NormalizerStrats, NumericalStrats, Operators, Predictor, + PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, + SPDict, Tabular, TargetSelector, Transformer, VectorizerStarts, Verbose, + Warnings, XSelector, YSelector, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk, - check_dependency, check_scaling, composed, crash, fit_one, flt, get_cols, + ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, + check_dependency, composed, crash, fit_one, flt, get_cols, get_custom_scorer, has_task, is_sparse, lst, make_sklearn, merge, - method_to_log, replace_missing, sign, to_pyarrow, + method_to_log, n_cols, replace_missing, sign, ) @@ -156,9 +155,8 @@ def __init__( self._log(f"Parallel processing with {self.n_jobs} cores.", 1) elif self.backend != "loky": self._log( - "Leaving n_jobs=1 ignores all parallelization. 
Set n_jobs>1 to make use " - f"of the {self.backend} parallelization backend.", - 1, + "Leaving n_jobs=1 ignores all parallelization. Set n_jobs>1 to " + f"make use of the {self.backend} parallelization backend.", 1, severity="warning", ) if "cpu" not in self.device.lower(): @@ -315,27 +313,28 @@ def missing(self, value: Sequence[Any]): def scaled(self) -> bool: """Whether the feature set is scaled. - A data set is considered scaled when it has mean=0 and std=1, - or when there is a scaler in the pipeline. Binary columns (only - zeros and ones) are excluded from the calculation. + A data set is considered scaled when it has mean~0 and std~1, + or when there is a scaler in the pipeline. Categorical and + binary columns (only zeros and ones) are excluded from the + calculation. """ - return check_scaling(self.X, pipeline=self.pipeline) + return self.branch.check_scaling() @property - def duplicates(self) -> Int: + def duplicates(self) -> int: """Number of duplicate rows in the dataset.""" - return self.branch.dataset.duplicated().sum() + return int(self.branch.dataset.duplicated().sum()) @property - def nans(self) -> Series: + def nans(self) -> pd.Series: """Columns with the number of missing values in them. This property is unavailable for [sparse datasets][]. """ - if not is_sparse(self.X): - return replace_missing(self.X, self.missing).isna().sum() + if not is_sparse(self.branch.X): + return replace_missing(self.branch.X, self.missing).isna().sum() raise AttributeError("This property is unavailable for sparse datasets.") @@ -346,16 +345,16 @@ def n_nans(self) -> int: This property is unavailable for [sparse datasets][]. """ - if not is_sparse(self.X): - nans = replace_missing(self.X, self.missing).isna().sum(axis=1) + if not is_sparse(self.branch.X): + nans = replace_missing(self.branch.X, self.missing).isna().sum(axis=1) return len(nans[nans > 0]) raise AttributeError("This property is unavailable for sparse datasets.") @property - def numerical(self) -> Index: + def numerical(self) -> list[str]: """Names of the numerical features in the dataset.""" - return self.X.select_dtypes(include=["number"]).columns + return list(self.branch.X.select_dtypes(include=["number"]).columns) @property def n_numerical(self) -> int: @@ -363,9 +362,9 @@ def n_numerical(self) -> int: return len(self.numerical) @property - def categorical(self) -> Index: + def categorical(self) -> list[str]: """Names of the categorical features in the dataset.""" - return self.X.select_dtypes(include=CAT_TYPES).columns + return list(self.branch.X.select_dtypes(include=CAT_TYPES).columns) @property def n_categorical(self) -> int: @@ -379,7 +378,7 @@ def outliers(self) -> pd.Series: This property is unavailable for [sparse datasets][]. """ - if not is_sparse(self.X): + if not is_sparse(self.branch.X): data = self.branch.train.select_dtypes(include=["number"]) z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3 z_scores = pd.Series(z_scores.sum(axis=0), index=data.columns) @@ -388,16 +387,16 @@ def outliers(self) -> pd.Series: raise AttributeError("This property is unavailable for sparse datasets.") @property - def n_outliers(self) -> Int: + def n_outliers(self) -> int: """Number of samples in the training set containing outliers. This property is unavailable for [sparse datasets][]. 
""" - if not is_sparse(self.X): + if not is_sparse(self.branch.X): data = self.branch.train.select_dtypes(include=["number"]) z_scores = np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3 - return z_scores.any(axis=1).sum() + return int(z_scores.any(axis=1).sum()) raise AttributeError("This property is unavailable for sparse datasets.") @@ -429,14 +428,14 @@ def classes(self) -> pd.DataFrame: raise AttributeError("This property is unavailable for regression tasks.") @property - def n_classes(self) -> Int | Series: + def n_classes(self) -> Int | pd.Series: """Number of classes in the target column(s). This property is only available for classification tasks. """ if self.task.is_classification: - return self.y.nunique(dropna=False) + return self.branch.y.nunique(dropna=False) raise AttributeError("This property is unavailable for regression tasks.") @@ -671,7 +670,7 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -684,14 +683,14 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). - If None, X is ignored in the transformers. + If None, `X` is ignored in the transformers. y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Transformed target column corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -713,7 +712,7 @@ def inverse_transform( X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.inverse_transform(X, y) + return self._convert(pipeline.inverse_transform(X, y)) @classmethod def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM: @@ -749,11 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM **X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features). - **y: int, str or sequence**
- Target column corresponding to `X`. + **y: int, str, dict, sequence or dataframe**
+ Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -815,7 +814,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM X_test, y_test = branch.pipeline.transform(branch.X_test, branch.y_test) # Update complete dataset - branch._container.data = bk.concat( + branch._container.data = pd.concat( [merge(X_train, y_train), merge(X_test, y_test)] ) @@ -824,7 +823,7 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM data=(dataset := branch._container.data.reset_index(drop=True)), train_idx=dataset.index[:len(branch._container.train_idx)], test_idx=dataset.index[-len(branch._container.test_idx):], - n_cols=branch._container.n_cols, + n_targets=branch._container.n_targets, ) # Store inactive branches in memory @@ -929,7 +928,7 @@ def shrink( """ - def get_data(new_t: DtypeObj) -> Series: + def get_data(new_t: DtypeObj) -> pd.Series: """Get the series with the right data format. Also converts to sparse format if `dense2sparse=True`. @@ -941,7 +940,7 @@ def get_data(new_t: DtypeObj) -> Series: Returns ------- - series + pd.Series Object with the new data type. """ @@ -975,9 +974,6 @@ def get_data(new_t: DtypeObj) -> Series: data = self.branch.dataset[self.branch._get_columns(columns)] - # Convert back since convert_dtypes doesn't work properly for pyarrow dtypes - data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()}) - # Convert to the best nullable dtype data = data.convert_dtypes() @@ -1012,11 +1008,6 @@ def get_data(new_t: DtypeObj) -> Series: get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max() ) - if self.engine.data == "pyarrow": - self.branch.dataset = self.dataset.astype( - {name: to_pyarrow(col) for name, col in self.dataset.items()} - ) - self._log("The column dtypes are successfully converted.", 1) @composed(crash, method_to_log) @@ -1030,7 +1021,7 @@ def stats(self, _vb: Int = -2, /): """ self._log("Dataset stats " + "=" * 20 + " >>", _vb) - self._log(f"Shape: {self.shape}", _vb) + self._log(f"Shape: {self.branch.shape}", _vb) if self.task.is_forecast and self.sp.sp: self._log(f"Seasonal period: {self.sp.sp}", _vb) @@ -1041,15 +1032,15 @@ def stats(self, _vb: Int = -2, /): self._log(f" --> From: {min(data.index)} To: {max(data.index)}", _vb) self._log("-" * 37, _vb) - if (memory := self.dataset.memory_usage().sum()) < 1e6: + if (memory := self.branch.dataset.memory_usage().sum()) < 1e6: self._log(f"Memory: {memory / 1e3:.2f} kB", _vb) else: self._log(f"Memory: {memory / 1e6:.2f} MB", _vb) - if is_sparse(self.X): + if is_sparse(self.branch.X): self._log("Sparse: True", _vb) - if hasattr(self.X, "sparse"): # All columns are sparse - self._log(f"Density: {100. * self.X.sparse.density:.2f}%", _vb) + if hasattr(self.branch.X, "sparse"): # All columns are sparse + self._log(f"Density: {100. 
* self.branch.X.sparse.density:.2f}%", _vb) else: # Not all columns are sparse n_sparse = sum(isinstance(self[c].dtype, pd.SparseDtype) for c in self.features) n_dense = self.n_features - n_sparse @@ -1062,7 +1053,7 @@ def stats(self, _vb: Int = -2, /): n_categorical = self.n_categorical outliers = self.outliers.sum() try: # Can fail for unhashable columns (e.g., multilabel with lists) - duplicates = self.dataset.duplicated().sum() + duplicates = self.branch.dataset.duplicated().sum() except TypeError: duplicates = None self._log( @@ -1071,7 +1062,7 @@ def stats(self, _vb: Int = -2, /): 3, ) - if not self.X.empty: + if not self.branch.X.empty: self._log(f"Scaled: {self.scaled}", _vb) if nans: p_nans = round(100 * nans / self.branch.dataset.size, 1) @@ -1103,27 +1094,26 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Transform new data through the pipeline. Transformers that are only applied on the training set are skipped. If only `X` or only `y` is provided, it ignores transformers that require the other parameter. This can be - of use to, for example, transform only the target column. + of use to, for example, transform only the target column. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. If None, - X is ignored in the transformers. + `X` is ignored. y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. 
@@ -1145,7 +1135,7 @@ def transform( X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target) with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.transform(X, y) + return self._convert(pipeline.transform(X, y)) # Base transformers ============================================ >> @@ -1276,7 +1266,7 @@ def _add_transformer( fit = self._memory.cache(fit_one) kwargs = { "estimator": transformer_c, - "X": self.X_train, + "X": self.branch.X_train, "y": self.y_train, **fit_params, } @@ -1296,35 +1286,41 @@ def _add_transformer( self._branches.add("og") if transformer_c._train_only: - X, y = self.pipeline._mem_transform(transformer_c, self.X_train, self.y_train) - self.train = merge( - self.X_train if X is None else X, - self.y_train if y is None else y, + X, y = self.pipeline._mem_transform( + transformer=transformer_c, + X=self.branch.X_train, + y=self.branch.y_train, ) + + self.branch.train = merge( + self.branch.X_train if X is None else X, + self.branch.y_train if y is None else y, + ) + else: - X, y = self.pipeline._mem_transform(transformer_c, self.X, self.y) - data = merge(self.X if X is None else X, self.y if y is None else y) + X, y = self.pipeline._mem_transform(transformer_c, self.branch.X, self.branch.y) + data = merge(self.branch.X if X is None else X, self.branch.y if y is None else y) # y can change the number of columns or remove rows -> reassign index self.branch._container = DataContainer( data=data, train_idx=self.branch._data.train_idx.intersection(data.index), test_idx=self.branch._data.test_idx.intersection(data.index), - n_cols=self.branch._data.n_cols if y is None else len(get_cols(y)), + n_targets=self.branch._data.n_targets if y is None else n_cols(y), ) if self._config.index is False: self.branch._container = DataContainer( - data=(data := self.dataset.reset_index(drop=True)), + data=(data := self.branch.dataset.reset_index(drop=True)), train_idx=data.index[: len(self.branch._data.train_idx)], test_idx=data.index[-len(self.branch._data.test_idx):], - n_cols=self.branch._data.n_cols, + n_targets=self.branch._data.n_targets, ) if self.branch._holdout is not None: self.branch._holdout.index = range( len(data), len(data) + len(self.branch._holdout) ) - elif self.dataset.index.duplicated().any(): + elif self.branch.dataset.index.duplicated().any(): raise ValueError( "Duplicate indices found in the dataset. " "Try initializing atom using `index=False`." @@ -1477,8 +1473,8 @@ def apply( Parameters ---------- func: callable - Function to apply with signature `func(dataset, **kw_args) -> - dataset`. + Function to apply with signature `func(dataframe, **kw_args) + -> dataframe-like`. inverse_func: callable or None, default=None Inverse function of `func`. 
If None, the inverse_transform @@ -1729,8 +1725,8 @@ def encode( @composed(crash, method_to_log) def impute( self, - strat_num: Scalar | NumericalStrats = "drop", - strat_cat: str | CategoricalStrats = "drop", + strat_num: Scalar | NumericalStrats = "mean", + strat_cat: str | CategoricalStrats = "most_frequent", *, max_nan_rows: FloatLargerZero | None = None, max_nan_cols: FloatLargerZero | None = None, diff --git a/atom/basemodel.py b/atom/basemodel.py index 584574595..7a418e847 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -67,17 +67,17 @@ from atom.utils.patches import fit_and_score from atom.utils.types import ( HT, Backend, Bool, DataFrame, Engine, FHConstructor, Float, - FloatZeroToOneExc, Index, Int, IntLargerEqualZero, MetricConstructor, + FloatZeroToOneExc, Int, IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS, - Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, TargetSelector, - Verbose, Warnings, XSelector, YSelector, dataframe_t, float_t, int_t, + Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, Tabular, + TargetSelector, Verbose, Warnings, XSelector, YSelector, float_t, int_t, ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, - TrialsCallback, adjust_verbosity, bk, cache, check_dependency, check_empty, - check_scaling, composed, crash, estimator_has_attr, flt, get_cols, - get_custom_scorer, has_task, it, lst, merge, method_to_log, rnd, sign, - time_to_str, to_pandas, + TrialsCallback, adjust_verbosity, cache, check_dependency, check_empty, + composed, crash, estimator_has_attr, flt, get_cols, get_custom_scorer, + has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df, + to_series, to_tabular, ) @@ -148,6 +148,7 @@ class BaseModel(RunnerPlot): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. 
Read more in the @@ -264,9 +265,8 @@ def __init__( self._branch = branches.current self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts - if hasattr(self, "needs_scaling"): - if self.needs_scaling and not check_scaling(self.X, pipeline=self.pipeline): - self.scaler = Scaler().fit(self.X_train) + if getattr(self, "needs_scaling", None) and not self.branch.check_scaling(): + self.scaler = Scaler().fit(self.X_train) def __repr__(self) -> str: """Display class name.""" @@ -274,17 +274,25 @@ def __repr__(self) -> str: def __dir__(self) -> list[str]: """Add additional attrs from __getattr__ to the dir.""" - attrs = list(super().__dir__()) + # Exclude from _available_if conditions + attrs = [x for x in super().__dir__() if hasattr(self, x)] + if "_branch" in self.__dict__: - attrs += [x for x in dir(self.branch) if not x.startswith("_")] - attrs += list(DF_ATTRS) + # Add additional attrs from the branch + attrs += Branch._get_data_attrs() + + # Add additional attrs from the dataset + attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)] + + # Add column names (excluding those with spaces) attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)] + return attrs def __getattr__(self, item: str) -> Any: """Get attributes from branch or data.""" if "_branch" in self.__dict__: - if item in dir(self.branch) and not item.startswith("_"): + if item in Branch._get_data_attrs(): return getattr(self.branch, item) # Get attr from branch elif item in self.branch.columns: return self.branch.dataset[item] # Get column @@ -297,7 +305,7 @@ def __contains__(self, item: str) -> bool: """Whether the item is a column in the dataset.""" return item in self.dataset - def __getitem__(self, item: Int | str | list) -> Pandas: + def __getitem__(self, item: Int | str | list) -> Tabular: """Get a subset from the dataset.""" if isinstance(item, int_t): return self.dataset[self.columns[int(item)]] @@ -485,8 +493,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ) -> Predictor: """Fit the estimator and perform in-training validation. @@ -641,7 +649,7 @@ def _get_pred( rows: RowSelector, target: TargetSelector | None = None, method: PredictionMethods | Sequence[PredictionMethods] = "predict", - ) -> tuple[Pandas, Pandas]: + ) -> tuple[Tabular, Tabular]: """Get the true and predicted values for a column. Predictions are made using the `decision_function` or @@ -688,7 +696,7 @@ def _get_pred( # Statsmodels models such as SARIMAX and DF require all # exogenous data after the last row of the train set # Other models accept this format - Xe = bk.concat([self.test, self.holdout]) # type: ignore[list-item] + Xe = pd.concat([self.test, self.holdout]) # type: ignore[list-item] exog = Xe.loc[Xe.index <= X.index.max(), self.features] # type: ignore[index] y_pred = self._prediction( @@ -704,7 +712,7 @@ def _get_pred( f"Failed to get predictions for model {self.name} " f"on rows {rows}. Returning NaN. 
Exception: {ex}.", 3 ) - y_pred = bk.Series([np.NaN] * len(X), index=X.index) + y_pred = pd.Series([np.NaN] * len(X), index=X.index) else: y_pred = self._prediction(X.index, verbose=0, method=method_caller) @@ -722,7 +730,7 @@ def _score_from_est( self, scorer: Scorer, estimator: Predictor, - X: DataFrame, + X: pd.DataFrame, y: Pandas, **kwargs, ) -> Float: @@ -736,11 +744,11 @@ def _score_from_est( estimator: Predictor Estimator instance to get the score from. - X: dataframe + X: pd.DataFrame Feature set. - y: series or dataframe - Target column corresponding to `X`. + y: pd.Series or pd.DataFrame + Target column(s) corresponding to `X`. **kwargs Additional keyword arguments for the `scorer`. @@ -754,7 +762,7 @@ def _score_from_est( if self.task.is_forecast: y_pred = estimator.predict(fh=y.index, X=check_empty(X)) else: - y_pred = to_pandas( + y_pred = to_tabular( data=estimator.predict(X), index=y.index, columns=getattr(y, "columns", None), @@ -766,8 +774,8 @@ def _score_from_est( def _score_from_pred( self, scorer: Scorer, - y_true: Pandas, - y_pred: Pandas, + y_true: Tabular, + y_pred: Tabular, **kwargs, ) -> Float: """Calculate the metric score from predicted values. @@ -854,7 +862,7 @@ def _get_score( and hasattr(self.estimator, "predict_proba") ): y_true, y_pred = self._get_pred(rows, method="predict_proba") - if isinstance(y_pred, dataframe_t): + if isinstance(y_pred, pd.DataFrame): # Update every target column with its corresponding threshold for i, value in enumerate(threshold): y_pred.iloc[:, i] = (y_pred.iloc[:, i] > value).astype("int") @@ -1025,7 +1033,7 @@ def fit_model( args.append(cols) # Parallel loop over fit_model - results = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + results = Parallel(n_jobs=self.n_jobs)( delayed(fit_model)(estimator, i, j) for i, j in splitter.split(*args) ) @@ -1150,7 +1158,7 @@ def fit_model( self._log(f"Time elapsed: {time_to_str(self.trials.iat[-1, -2])}", 1) @composed(crash, method_to_log, beartype) - def fit(self, X: DataFrame | None = None, y: Pandas | None = None): + def fit(self, X: pd.DataFrame | None = None, y: Pandas | None = None): """Fit and validate the model. The estimator is fitted using the best hyperparameters found @@ -1160,12 +1168,12 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None): Parameters ---------- - X: dataframe or None + X: pd.DataFrame or None Feature set with shape=(n_samples, n_features). If None, `self.X_train` is used. - y: series, dataframe or None - Target column corresponding to `X`. If None, `self.y_train` + y: pd.Series, pd.DataFrame or None + Target column(s) corresponding to `X`. If None, `self.y_train` is used. 
""" @@ -1233,28 +1241,25 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None): sk_model=self.estimator, artifact_path=self._est_class.__name__, signature=infer_signature( - model_input=pd.DataFrame(self.X), + model_input=self.X, model_output=self.estimator.predict(self.X_test.iloc[[0]]), ), - input_example=pd.DataFrame(self.X.iloc[[0]]), + input_example=self.X.iloc[[0]], ) if self.log_data: for ds in ("train", "test"): - mlflow.log_input( - dataset=from_pandas(pd.DataFrame(getattr(self, ds))), - context=ds, - ) + mlflow.log_input(dataset=from_pandas(getattr(self, ds)), context=ds) if self.log_pipeline: mlflow.sklearn.log_model( sk_model=self.export_pipeline(), artifact_path=f"{self._est_class.__name__}_pipeline", signature=infer_signature( - model_input=pd.DataFrame(self.X), + model_input=self.X, model_output=self.estimator.predict(self.X_test.iloc[[0]]), ), - input_example=pd.DataFrame(self.X.iloc[[0]]), + input_example=self.X.iloc[[0]], ) @composed(crash, method_to_log, beartype) @@ -1629,22 +1634,22 @@ def pipeline(self) -> Pipeline: return self.branch.pipeline @property - def dataset(self) -> DataFrame: + def dataset(self) -> pd.DataFrame: """Complete data set.""" return merge(self.X, self.y) @property - def train(self) -> DataFrame: + def train(self) -> pd.DataFrame: """Training set.""" return merge(self.X_train, self.y_train) @property - def test(self) -> DataFrame: + def test(self) -> pd.DataFrame: """Test set.""" return merge(self.X_test, self.y_test) @property - def holdout(self) -> DataFrame | None: + def holdout(self) -> pd.DataFrame | None: """Holdout set.""" if (holdout := self.branch.holdout) is not None: if self.scaler: @@ -1655,17 +1660,17 @@ def holdout(self) -> DataFrame | None: return None @property - def X(self) -> DataFrame: + def X(self) -> pd.DataFrame: """Feature set.""" - return bk.concat([self.X_train, self.X_test]) + return pd.concat([self.X_train, self.X_test]) @property def y(self) -> Pandas: - """Target column.""" - return bk.concat([self.y_train, self.y_test]) + """Target column(s).""" + return pd.concat([self.y_train, self.y_test]) @property - def X_train(self) -> DataFrame: + def X_train(self) -> pd.DataFrame: """Features of the training set.""" features = self.branch.features.isin(self._config.ignore) if self.scaler: @@ -1679,7 +1684,7 @@ def y_train(self) -> Pandas: return self.branch.y_train[-self._train_idx:] @property - def X_test(self) -> DataFrame: + def X_test(self) -> pd.DataFrame: """Features of the test set.""" features = self.branch.features.isin(self._config.ignore) if self.scaler: @@ -1688,7 +1693,7 @@ def X_test(self) -> DataFrame: return self.branch.X_test.iloc[:, ~features] @property - def X_holdout(self) -> DataFrame | None: + def X_holdout(self) -> pd.DataFrame | None: """Features of the holdout set.""" if self.holdout is not None: return self.holdout[self.features] @@ -1709,34 +1714,34 @@ def shape(self) -> tuple[Int, Int]: return self.dataset.shape @property - def columns(self) -> Index: + def columns(self) -> list[str]: """Name of all the columns.""" - return self.dataset.columns + return list(self.dataset.columns) @property - def n_columns(self) -> Int: + def n_columns(self) -> int: """Number of columns.""" return len(self.columns) @property - def features(self) -> Index: + def features(self) -> list[str]: """Name of the features.""" - return self.columns[:-self.branch._data.n_cols] + return list(self.columns[:-self.branch._data.n_targets]) @property - def n_features(self) -> Int: + def n_features(self) -> int: 
"""Number of features.""" return len(self.features) @property - def _all(self) -> DataFrame: + def _all(self) -> pd.DataFrame: """Dataset + holdout. Note that calling this property triggers the holdout set calculation. """ - return bk.concat([self.dataset, self.holdout]) + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> @@ -1837,8 +1842,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]: """ conv = lambda elem: elem.item() if hasattr(elem, "item") else elem - y_pred = self.inverse_transform(y=self.predict([X], verbose=0), verbose=0) - if isinstance(y_pred, dataframe_t): + if isinstance(y_pred := self.predict([X], verbose=0), pd.DataFrame): return [conv(elem) for elem in y_pred.iloc[0, :]] else: return conv(y_pred[0]) @@ -1859,7 +1863,7 @@ def inference(*X) -> Scalar | str | list[Scalar | str]: self.app = Interface( fn=inference, inputs=inputs, - outputs=["label"] * self.branch._data.n_cols, + outputs=["label"] * self.branch._data.n_targets, allow_flagging=kwargs.pop("allow_flagging", "never"), **{k: v for k, v in kwargs.items() if k in sign(Interface)}, ) @@ -2082,12 +2086,12 @@ def evaluate( """ if isinstance(threshold, float_t): - threshold_c = [threshold] * self.branch._data.n_cols # Length=n_targets - elif len(threshold) != self.branch._data.n_cols: + threshold_c = [threshold] * self.branch._data.n_targets # Length=n_targets + elif len(threshold) != self.branch._data.n_targets: raise ValueError( "Invalid value for the threshold parameter. The length of the list " f"list should be equal to the number of target columns, got len(target)" - f"={self.branch._data.n_cols} and len(threshold)={len(threshold)}." + f"={self.branch._data.n_targets} and len(threshold)={len(threshold)}." ) else: threshold_c = list(threshold) @@ -2185,8 +2189,8 @@ def full_train(self, *, include_holdout: Bool = False): raise ValueError("No holdout data set available.") if include_holdout and self.holdout is not None: - X = bk.concat([self.X, self.X_holdout]) - y = bk.concat([self.y, self.y_holdout]) + X = pd.concat([self.X, self.X_holdout]) + y = pd.concat([self.y, self.y_holdout]) else: X, y = self.X, self.y @@ -2234,7 +2238,7 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -2249,14 +2253,14 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). - If None, X is ignored in the transformers. + If None, `X` is ignored in the transformers. y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. 
@@ -2413,7 +2417,7 @@ async def __call__(self, request: Request) -> np.ndarray: """ payload = await request.json() - return getattr(self.pipeline, self.method)(bk.read_json(payload)) + return getattr(self.pipeline, self.method)(pd.read_json(payload)) if not ray.is_initialized(): ray.init(log_to_driver=False) @@ -2433,7 +2437,7 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Transform new data through the pipeline. Transformers that are only applied on the training set are @@ -2448,14 +2452,14 @@ def transform( X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, - X is ignored in the transformers. + `X` is ignored in the transformers. y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -2530,7 +2534,7 @@ def _prediction( sample_weight: Sequence[Scalar] | None = ..., verbose: Int | None = ..., method: PredictionMethods = ..., - ) -> Pandas: ... + ) -> Tabular: ... def _prediction( self, @@ -2540,7 +2544,7 @@ def _prediction( sample_weight: Sequence[Scalar] | None = None, verbose: Int | None = None, method: PredictionMethods = "predict", - ) -> Float | Pandas: + ) -> Float | Tabular: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2555,11 +2559,11 @@ def _prediction( on. y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -2590,7 +2594,7 @@ def _prediction( """ - def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[DataFrame, Pandas]: + def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[DataFrame, Tabular]: """Get X and y from the pipeline transformation. Parameters @@ -2599,7 +2603,7 @@ def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[DataFrame, Pandas]: Feature set. y: int, str or sequence - Target column. + Target column(s). 
Returns ------- @@ -2630,7 +2634,7 @@ def assign_prediction_columns() -> list[str]: return self.mapping.get(self.target, np.unique(self.y).astype(str)) try: - if isinstance(X, dataframe_t): + if isinstance(X, pd.DataFrame): # Dataframe must go first since we can expect # prediction calls from dataframes with reset indices Xt, yt = get_transform_X_y(X, y) @@ -2645,25 +2649,22 @@ def assign_prediction_columns() -> list[str]: if method != "score": pred = np.array(self.memory.cache(getattr(self.estimator, method))(Xt[self.features])) - if pred.ndim < 3: - data = to_pandas( - data=pred, - index=Xt.index, - name=self.target, - columns=assign_prediction_columns(), - ) + if pred.ndim == 1: + data = to_series(pred, index=Xt.index, name=self.target) + elif pred.ndim < 3: + data = to_df(pred, index=Xt.index, columns=assign_prediction_columns()) elif self.task is Task.multilabel_classification: # Convert to (n_samples, n_targets) - data = bk.DataFrame( + data = pd.DataFrame( data=np.array([d[:, 1] for d in pred]).T, index=Xt.index, columns=assign_prediction_columns(), ) else: # Convert to (n_samples * n_classes, n_targets) - data = bk.DataFrame( + data = pd.DataFrame( data=pred.reshape(-1, pred.shape[2]), - index=bk.MultiIndex.from_tuples( + index=pd.MultiIndex.from_tuples( [(col, idx) for col in np.unique(self.y) for idx in Xt.index] ), columns=assign_prediction_columns(), @@ -2692,7 +2693,7 @@ def decision_function( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> Pandas: + ) -> Tabular: """Get confidence scores on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2731,7 +2732,7 @@ def predict( *, inverse: Bool = True, verbose: Int | None = None, - ) -> Pandas: + ) -> Tabular: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2877,11 +2878,11 @@ def score( on. y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - If None: `X` must be a selection of rows in the dataset. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -2966,7 +2967,7 @@ def _prediction( verbose: Int | None = None, method: PredictionMethodsTS = ..., **kwargs, - ) -> Pandas: ... + ) -> Tabular: ... def _prediction( self, @@ -2977,7 +2978,7 @@ def _prediction( verbose: Int | None = None, method: PredictionMethodsTS = "predict", **kwargs, - ) -> Float | Pandas: + ) -> Float | Tabular: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3051,8 +3052,9 @@ def predict( fh: RowSelector | FHConstructor, X: XSelector | None = None, *, + inverse: Bool = True, verbose: Int | None = None, - ) -> Pandas: + ) -> Tabular: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3070,6 +3072,12 @@ def predict( X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to `fh`. + inverse: bool, default=True + Whether to inversely transform the output through the + pipeline. 
This doesn't affect the predictions if there are + no transformers in the pipeline or if the transformers have + no `inverse_transform` method or don't apply to `y`. + verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. @@ -3081,7 +3089,12 @@ def predict( n_targets) for [multivariate][] tasks. """ - return self._prediction(fh=fh, X=X, verbose=verbose, method="predict") + pred = self._prediction(fh=fh, X=X, verbose=verbose, method="predict") + + if inverse: + return self.inverse_transform(y=pred) + else: + return pred @available_if(estimator_has_attr("predict_interval")) @composed(crash, method_to_log, beartype) @@ -3236,7 +3249,7 @@ def predict_residuals( X: XSelector | None = None, *, verbose: Int | None = None, - ) -> Pandas: + ) -> Tabular: """Get residuals of forecasts on new data or existing rows. New data is first transformed through the model's pipeline. diff --git a/atom/baserunner.py b/atom/baserunner.py index c85f95222..ff6d51a63 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -15,7 +15,7 @@ from copy import deepcopy from functools import cached_property from pathlib import Path -from typing import Any +from typing import Any, overload, Literal import dill as pickle import numpy as np @@ -39,14 +39,14 @@ from atom.utils.types import ( Bool, DataFrame, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int, IntLargerOne, MetricConstructor, Model, ModelSelector, ModelsSelector, - Pandas, RowSelector, Seasonality, Segment, Sequence, Series, SPDict, - SPTuple, TargetSelector, YSelector, bool_t, dataframe_t, int_t, segment_t, - sequence_t, + RowSelector, Seasonality, Segment, Sequence, Series, SPDict, SPTuple, + Tabular, TargetSelector, XSelector, YSelector, bool_t, int_t, pandas_t, + segment_t, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataContainer, Goal, SeasonalPeriod, Task, bk, check_is_fitted, + ClassMap, DataContainer, Goal, SeasonalPeriod, Task, check_is_fitted, composed, crash, divide, flt, get_cols, get_segment, get_versions, - has_task, lst, merge, method_to_log, n_cols, + has_task, lst, merge, method_to_log, n_cols, to_df, to_tabular, ) @@ -80,27 +80,42 @@ def __setstate__(self, state: dict[str, Any]): def __dir__(self) -> list[str]: """Add additional attrs from __getattr__ to the dir.""" - attrs = list(super().__dir__()) - attrs += [x for x in dir(self.branch) if not x.startswith("_")] - attrs += list(DF_ATTRS) + # Exclude from _available_if conditions + attrs = [x for x in super().__dir__() if hasattr(self, x)] + + # Add additional attrs from the branch + attrs += Branch._get_data_attrs() + + # Add additional attrs from the dataset + attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)] + + # Add branch names in lower-case attrs += [b.name.lower() for b in self._branches] + + # Add column names (excluding those with spaces) attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)] + + # Add model names in lower-case if isinstance(self._models, ClassMap): attrs += [m.name.lower() for m in self._models] + return attrs def __getattr__(self, item: str) -> Any: """Get branch, attr from branch, model, column or attr from dataset.""" if item in self.__dict__["_branches"]: return self._branches[item] # Get branch - elif item in dir(self.branch) and not item.startswith("_"): - return getattr(self.branch, item) # Get attr from branch + elif item in Branch._get_data_attrs(): + if isinstance(attr := getattr(self.branch, item), pandas_t): + return self._convert(attr) 
# Transform data through data engine + else: + return attr elif item in self.__dict__["_models"]: return self._models[item] # Get model elif item in self.branch.columns: return self.branch.dataset[item] # Get column from dataset - elif item in DF_ATTRS: - return getattr(self.branch.dataset, item) # Get attr from dataset + elif item in DF_ATTRS and hasattr(self.dataset, item): + return getattr(self.dataset, item) # Get attr from dataset else: raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'.") @@ -120,7 +135,7 @@ def __delattr__(self, item: str): def __len__(self) -> int: """Return length of dataset.""" - return len(self.dataset) + return len(self.branch.dataset) def __contains__(self, item: str) -> bool: """Whether the item is a column in the dataset.""" @@ -159,7 +174,7 @@ def __sklearn_is_fitted__(self) -> bool: @cached_property def task(self) -> Task: """Dataset's [task][] type.""" - return self._goal.infer_task(self.y) + return self._goal.infer_task(self.branch.y) @property def sp(self) -> SPTuple: @@ -316,6 +331,130 @@ def frac(m: Model) -> float: # Utility methods ============================================== >> + @staticmethod + @overload + def _check_input(X: XSelector, y: Literal[None]) -> tuple[DataFrame, None]: ... + + @staticmethod + @overload + def _check_input(X: Literal[None], y: YSelector) -> tuple[None, Tabular]: ... + + @staticmethod + @overload + def _check_input(X: XSelector, y: YSelector) -> tuple[DataFrame, Tabular]: ... + + @staticmethod + def _check_input( + X: XSelector | None = None, + y: YSelector | None = None, + ) -> tuple[DataFrame | None, Tabular | None]: + """Prepare the input data. + + Convert X and y to pandas (if not already) and perform standard + compatibility checks (dimensions, length, indices, etc...). + + Parameters + ---------- + X: dataframe-like or None, default=None + Feature set with shape=(n_samples, n_features). If None, + X is ignored. + + y: int, str, dict, sequence, dataframe or None, default=None + Target column(s) corresponding to `X`. + + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. + - If sequence: Target column with shape=(n_samples,) or + sequence of column names or positions for multioutput + tasks. + - If dataframe: Target columns for multioutput tasks. + + Returns + ------- + dataframe or None + Feature dataset. Only returned if provided. + + series, dataframe or None + Target column(s) corresponding to `X`. 
+ + """ + Xt: pd.DataFrame | None = None + yt: Pandas | None = None + + if X is None and y is None: + raise ValueError("X and y can't be both None!") + elif X is not None: + Xt = to_df(deepcopy(X() if callable(X) else X)) + + # If text dataset, change the name of the column to corpus + if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": + Xt = Xt.rename(columns={Xt.columns[0]: "corpus"}) + else: + # Convert all column names to str + Xt.columns = Xt.columns.astype(str) + + # No duplicate rows nor column names are allowed + if Xt.columns.duplicated().any(): + raise ValueError("Duplicate column names found in X.") + + # Prepare target column + if isinstance(y, (dict, *sequence_t, DataFrame)): + if isinstance(y, dict): + yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None)) + else: + # If X and y have different number of rows, try multioutput + if Xt is not None and len(Xt) != len(y): + try: + targets: list[Hashable] = [] + for col in y: + if col in Xt.columns: + targets.append(col) + elif isinstance(col, int_t): + if -Xt.shape[1] <= col < Xt.shape[1]: + targets.append(Xt.columns[int(col)]) + else: + raise IndexError( + "Invalid value for the y parameter. Value " + f"{col} is out of range for data with " + f"{Xt.shape[1]} columns." + ) + + Xt, yt = Xt.drop(columns=targets), Xt[targets] + + except (TypeError, IndexError, KeyError): + raise ValueError( + "X and y don't have the same number of rows," + f" got len(X)={len(Xt)} and len(y)={len(y)}." + ) from None + else: + yt = y + + yt = to_tabular(deepcopy(yt), index=getattr(Xt, "index", None)) + + # Check X and y have the same indices + if Xt is not None and not Xt.index.equals(yt.index): + raise ValueError("X and y don't have the same indices!") + + elif isinstance(y, str): + if Xt is not None: + if y not in Xt.columns: + raise ValueError(f"Column {y} not found in X!") + + Xt, yt = Xt.drop(columns=y), Xt[y] + + else: + raise ValueError("X can't be None when y is a string.") + + elif isinstance(y, int_t): + if Xt is None: + raise ValueError("X can't be None when y is an int.") + + Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] + + return Xt, yt + def _get_sp(self, sp: Seasonality) -> int | list[int] | None: """Get the seasonal period. @@ -378,11 +517,11 @@ def get_single_sp(sp: Int | str) -> int: def _get_data( self, - arrays: tuple, + arrays: tuple[Any, ...], y: YSelector = -1, *, index: IndexSelector = False, - ) -> tuple[DataContainer, DataFrame | None]: + ) -> tuple[DataContainer, pd.DataFrame | None]: """Get data sets from a sequence of indexables. Also assigns an index, (stratified) shuffles and selects a @@ -404,7 +543,7 @@ def _get_data( DataContainer Train and test sets. - dataframe or None + pd.DataFrame or None Holdout data set. Returns None if not specified. """ @@ -439,7 +578,7 @@ def _subsample(df: DataFrame) -> DataFrame: else: return df.iloc[sorted(random.sample(range(len(df)), k=n_rows))] - def _set_index(df: DataFrame, y: Pandas | None) -> DataFrame: + def _set_index(df: DataFrame, y: Tabular | None) -> DataFrame: """Assign an index to the dataframe. Parameters @@ -495,7 +634,7 @@ def _set_index(df: DataFrame, y: Pandas | None) -> DataFrame: def _no_data_sets( X: DataFrame, - y: Pandas, + y: Tabular, ) -> tuple[DataContainer, DataFrame | None]: """Generate data sets from one dataset. 
@@ -589,18 +728,18 @@ def _no_data_sets( stratify=self._config.get_stratify_columns(data, y), ) - complete_set = _set_index(bk.concat([train, test, holdout]), y) + complete_set = _set_index(pd.concat([train, test, holdout]), y) container = DataContainer( data=(data := complete_set.iloc[: len(data)]), train_idx=data.index[:-len(test)], test_idx=data.index[-len(test):], - n_cols=len(get_cols(y)), + n_targets=n_cols(y), ) except ValueError as ex: # Clarify common error with stratification for multioutput tasks - if "least populated class" in str(ex) and isinstance(y, dataframe_t): + if "least populated class" in str(ex) and isinstance(y, pd.DataFrame): raise ValueError( "Stratification for multioutput tasks is applied over all target " "columns, which results in a least populated class that has only " @@ -617,11 +756,11 @@ def _no_data_sets( def _has_data_sets( X_train: DataFrame, - y_train: Pandas, + y_train: Tabular, X_test: DataFrame, - y_test: Pandas, + y_test: Tabular, X_holdout: DataFrame | None = None, - y_holdout: Pandas | None = None, + y_holdout: Tabular | None = None, ) -> tuple[DataContainer, DataFrame | None]: """Generate data sets from provided sets. @@ -701,13 +840,13 @@ def _has_data_sets( if holdout is not None: holdout.index = index[-len(holdout):] - complete_set = _set_index(bk.concat([train, test, holdout]), y_test) + complete_set = _set_index(pd.concat([train, test, holdout]), y_test) container = DataContainer( data=(data := complete_set.iloc[:len(train) + len(test)]), train_idx=data.index[: len(train)], test_idx=data.index[-len(test):], - n_cols=len(get_cols(y_train)), + n_targets=n_cols(y_train), ) if holdout is not None: @@ -787,7 +926,7 @@ def _has_data_sets( if self._goal.name == "forecast": # For forecasting, check if index complies with sktime's standard valid, msg, _ = check_is_mtype( - obj=pd.DataFrame(bk.concat([sets[0].data, sets[1]])), + obj=pd.DataFrame(pd.concat([sets[0].data, sets[1]])), mtype="pd.DataFrame", return_metadata=True, var_name="the dataset", @@ -797,7 +936,7 @@ def _has_data_sets( raise ValueError(msg) else: # Else check for duplicate indices - if bk.concat([sets[0].data, sets[1]]).index.duplicated().any(): + if pd.concat([sets[0].data, sets[1]]).index.duplicated().any(): raise ValueError( "Duplicate indices found in the dataset. " "Try initializing atom using `index=False`." 
@@ -1175,7 +1314,7 @@ def get_sample_weight(self, rows: RowSelector = "train") -> Series: """ _, y = self.branch._get_rows(rows, return_X_y=True) weights = compute_sample_weight("balanced", y=y) - return bk.Series(weights, name="sample_weight").round(3) + return pd.Series(weights, name="sample_weight").round(3) @available_if(has_task("forecast")) @composed(crash, beartype) diff --git a/atom/basetrainer.py b/atom/basetrainer.py index b88819538..8310d34ae 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -12,7 +12,7 @@ from datetime import datetime as dt from typing import Any -import joblib +import dask import mlflow import numpy as np import ray @@ -380,8 +380,10 @@ def execute_model(m: Model) -> Model | None: # each run the function execute_remote = ray.remote(num_cpus=self.n_jobs)(execute_model) models = ray.get([execute_remote.remote(m) for m in self._models]) + elif self.backend == "dask": + models = dask.compute(*[dask.delayed(execute_model)(m) for m in self._models]) else: - models = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + models = Parallel(n_jobs=self.n_jobs)( delayed(execute_model)(m) for m in self._models ) @@ -391,8 +393,7 @@ def execute_model(m: Model) -> Model | None: m.verbose = vb else: - with joblib.parallel_backend(backend=self.backend): - models = [model for m in self._models if (model := execute_model(m))] + models = [model for m in self._models if (model := execute_model(m))] self._models = ClassMap(m for m in models if m) diff --git a/atom/basetransformer.py b/atom/basetransformer.py index d5697754f..4ba7753b3 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -12,37 +12,34 @@ import re import tempfile import warnings -from collections.abc import Hashable -from copy import deepcopy from datetime import datetime as dt from importlib import import_module from importlib.util import find_spec from logging import DEBUG, FileHandler, Formatter, Logger, getLogger from multiprocessing import cpu_count from pathlib import Path -from typing import Literal, TypeVar, overload +from typing import Any, TypeVar import dagshub +import joblib import mlflow import numpy as np +import pandas as pd import ray import requests from beartype import beartype from dagshub.auth.token_auth import HTTPBearerAuth +from dask.distributed import Client from joblib.memory import Memory -from pandas._typing import Axes from ray.util.joblib import register_ray from sklearn.utils.validation import check_memory from atom.utils.types import ( - Backend, Bool, DataFrame, Engine, EngineDataOptions, - EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, - IntLargerEqualZero, Pandas, Sequence, Severity, Verbose, Warnings, - XSelector, YSelector, bool_t, dataframe_t, int_t, sequence_t, -) -from atom.utils.utils import ( - crash, flt, lst, make_sklearn, n_cols, to_df, to_pandas, + Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, + EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Severity, + Verbose, Warnings, bool_t, ) +from atom.utils.utils import check_dependency, crash, lst, make_sklearn T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -136,17 +133,11 @@ def engine(self, value: Engine): data=value.get("data", EngineTuple().data), estimator=value.get("estimator", EngineTuple().estimator), ) - else: - engine = value # type: ignore[assignment] - - if engine.data == "modin" and not ray.is_initialized(): - ray.init( - runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_Pandas__": "1"}}, - log_to_driver=False, - ) + 
elif isinstance(value, EngineTuple): + engine = value - # Update env variable to use for PandasModin in utils.py - os.environ["ATOM_DATA_ENGINE"] = engine.data + # Make sure the data engine library is installed + check_dependency(engine.data_engine.library) if engine.estimator == "sklearnex": if not find_spec("sklearnex"): @@ -189,6 +180,13 @@ def backend(self, value: Backend): register_ray() # Register ray as joblib backend if not ray.is_initialized(): ray.init(log_to_driver=False) + elif value == "dask": + try: + Client.current() + except ValueError: + Client(processes=False) + + joblib.parallel_config(backend=value) self._backend = value @@ -359,6 +357,55 @@ def _device_id(self) -> int: # Methods ====================================================== >> + def _convert(self, obj: Any) -> Any: + """Convert data to the type set in the data engine. + + Non-pandas types are returned as is. + + Parameters + ---------- + obj: object + Object to convert. + + Returns + ------- + object + Converted data or unchanged object. + + """ + # Only apply transformations when the engine is defined + if hasattr(self, "engine") and isinstance(obj, pd.Series | pd.DataFrame): + return self.engine.data_engine.convert(obj) + else: + return obj + + def _get_est_class(self, name: str, module: str) -> type[Estimator]: + """Import a class from a module. + + When the import fails, for example, if atom uses sklearnex and + that's passed to a transformer, use sklearn's (default engine). + + Parameters + ---------- + name: str + Name of the class to get. + + module: str + Module from which to get the class. + + Returns + ------- + Estimator + Class of the estimator. + + """ + try: + mod = import_module(f"{self.engine.estimator}.{module}") + except (ModuleNotFoundError, AttributeError): + mod = import_module(f"sklearn.{module}") + + return make_sklearn(getattr(mod, name)) + def _inherit( self, obj: T_Estimator, fixed: tuple[str, ...] = (), @@ -409,202 +456,6 @@ def _inherit( return make_sklearn(obj, feature_names_out=feature_names_out) - def _get_est_class(self, name: str, module: str) -> type[Estimator]: - """Import a class from a module. - - When the import fails, for example, if atom uses sklearnex and - that's passed to a transformer, use sklearn's (default engine). - - Parameters - ---------- - name: str - Name of the class to get. - - module: str - Module from which to get the class. - - Returns - ------- - Estimator - Class of the estimator. - - """ - try: - mod = import_module(f"{self.engine.estimator}.{module}") - except (ModuleNotFoundError, AttributeError): - mod = import_module(f"sklearn.{module}") - - return make_sklearn(getattr(mod, name)) - - @staticmethod - @overload - def _check_input( - X: XSelector, - y: Literal[None], - columns: Axes, - name: Literal[None], - ) -> tuple[DataFrame, None]: ... - - @staticmethod - @overload - def _check_input( - X: Literal[None], - y: YSelector, - columns: Literal[None], - name: str | Sequence[str], - ) -> tuple[None, Pandas]: ... - - @staticmethod - @overload - def _check_input( - X: XSelector, - y: YSelector, - columns: Axes | None = ..., - name: str | Sequence[str] | None = ..., - ) -> tuple[DataFrame, Pandas]: ... - - @staticmethod - def _check_input( - X: XSelector | None = None, - y: YSelector | None = None, - columns: Axes | None = None, - name: str | Sequence[str] | None = None, - ) -> tuple[DataFrame | None, Pandas | None]: - """Prepare the input data. 
- - Convert X and y to pandas (if not already) and perform standard - compatibility checks (dimensions, length, indices, etc...). - - Parameters - ---------- - X: dataframe-like or None, default=None - Feature set with shape=(n_samples, n_features). If None, - X is ignored. - - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - - columns: sequence or None, default=None - Names of the features corresponding to `X`. If X already is a - dataframe, force feature order. If None and X is not a - dataframe, assign default feature names. - - name: str, sequence or None, default=None - Name of the target column(s) corresponding to y. If None and - y is not a pandas object, assign default target name. - - Returns - ------- - dataframe or None - Feature dataset. Only returned if provided. - - series, dataframe or None - Target column corresponding to `X`. - - """ - Xt: DataFrame | None = None - yt: Pandas | None = None - - if X is None and y is None: - raise ValueError("X and y can't be both None!") - elif X is not None: - Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) - - # If text dataset, change the name of the column to corpus - if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": - Xt = Xt.rename(columns={Xt.columns[0]: "corpus"}) - else: - # Convert all column names to str - Xt.columns = Xt.columns.astype(str) - - # No duplicate rows nor column names are allowed - if Xt.columns.duplicated().any(): - raise ValueError("Duplicate column names found in X.") - - # Reorder columns to original order - if columns is not None: - try: - Xt = Xt[list(columns)] # Force order determined by columns - except KeyError: - raise ValueError( - f"The features are different than seen at fit time. " - f"Features {set(Xt.columns) - set(columns)} are missing in X." - ) from None - - # Prepare target column - if isinstance(y, (dict, *sequence_t, *dataframe_t)): - if isinstance(y, dict): - yt = to_df(deepcopy(y), index=getattr(Xt, "index", None)) - if n_cols(yt) == 1: - yt = yt.iloc[:, 0] # If y is one-dimensional, get series - - else: - # If X and y have different number of rows, try multioutput - if Xt is not None and len(Xt) != len(y): - try: - targets: list[Hashable] = [] - for col in y: - if col in Xt.columns: - targets.append(col) - elif isinstance(col, int_t): - if -Xt.shape[1] <= col < Xt.shape[1]: - targets.append(Xt.columns[int(col)]) - else: - raise IndexError( - "Invalid value for the y parameter. Value " - f"{col} is out of range for data with " - f"{Xt.shape[1]} columns." - ) - - Xt, yt = Xt.drop(columns=targets), Xt[targets] - - except (TypeError, IndexError, KeyError): - raise ValueError( - "X and y don't have the same number of rows," - f" got len(X)={len(Xt)} and len(y)={len(y)}." 
- ) from None - else: - yt = y - - default_cols = [f"y{i}" for i in range(n_cols(y))] - yt = to_pandas( - data=deepcopy(yt), - index=getattr(Xt, "index", None), - name=flt(name) if name is not None else "target", - columns=name if isinstance(name, sequence_t) else default_cols, - ) - - # Check X and y have the same indices - if Xt is not None and not Xt.index.equals(yt.index): - raise ValueError("X and y don't have the same indices!") - - elif isinstance(y, str): - if Xt is not None: - if y not in Xt.columns: - raise ValueError(f"Column {y} not found in X!") - - Xt, yt = Xt.drop(columns=y), Xt[y] - - else: - raise ValueError("X can't be None when y is a string.") - - elif isinstance(y, int_t): - if Xt is None: - raise ValueError("X can't be None when y is an int.") - - Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] - - return Xt, yt - @crash def _log(self, msg: str, level: Int = 0, severity: Severity = "info"): """Print message and save to log file. diff --git a/atom/branch/branch.py b/atom/branch/branch.py index 8481386a8..f179fd55b 100644 --- a/atom/branch/branch.py +++ b/atom/branch/branch.py @@ -15,6 +15,7 @@ from warnings import filterwarnings import dill as pickle +import pandas as pd from beartype import beartype from beartype.roar import BeartypeDecorHintPep585DeprecationWarning from joblib.memory import Memory @@ -22,12 +23,13 @@ from atom.pipeline import Pipeline from atom.utils.types import ( - Bool, ColumnSelector, DataFrame, Index, Int, IntLargerEqualZero, Pandas, - RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, XSelector, - YSelector, dataframe_t, index_t, int_t, segment_t, series_t, + Bool, ColumnSelector, DataFrame, Int, IntLargerEqualZero, Pandas, + RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, + XConstructor, XSelector, YSelector, int_t, segment_t, ) from atom.utils.utils import ( - DataContainer, bk, flt, get_cols, lst, merge, to_pandas, + DataContainer, check_scaling, flt, get_col_names, get_cols, lst, merge, + to_tabular, ) @@ -58,16 +60,16 @@ class Branch: name: str Name of the branch. - memory: str, [Memory][joblibmemory] or None, default=None - Memory object for pipeline caching and to store the data when - the branch is inactive. - data: DataContainer or None, default=None Data for the branch. - holdout: dataframe or None, default=None + holdout: pd.DataFrame or None, default=None Holdout data set. + memory: str, [Memory][joblibmemory] or None, default=None + Memory object for pipeline caching and to store the data when + the branch is inactive. + See Also -------- atom.branch:BranchManager @@ -101,9 +103,10 @@ class Branch: def __init__( self, name: str, - memory: str | Memory | None = None, data: DataContainer | None = None, - holdout: DataFrame | None = None, + holdout: pd.DataFrame | None = None, + *, + memory: str | Memory | None = None, ): self.name = name self.memory = check_memory(memory) @@ -164,11 +167,11 @@ def name(self, value: str): def _check_setter( self, name: str, - value: Sequence[Scalar | str] | XSelector, + value: Sequence[Scalar | str] | XConstructor, ) -> Pandas: """Check the data set's setter property. - Convert the property to a pandas object and compare with the + Convert the property to a 'pandas' object and compare with the rest of the dataset, to check if it has the right indices and dimensions. @@ -182,7 +185,7 @@ def _check_setter( Returns ------- - series or dataframe + pd.Series or pd.DataFrame Data set. 
""" @@ -226,11 +229,10 @@ def counter(name: str, dim: str) -> str | None: if under_name := counter(name, "under"): under = getattr(self, under_name) - obj = to_pandas( + obj = to_tabular( data=value, index=side.index if side_name else None, - name=getattr(under, "name", "target") if under_name else "target", - columns=getattr(under, "columns", None) if under_name else None, + columns=get_col_names(under) if under_name else None, ) if side_name: # Check for equal rows @@ -246,7 +248,7 @@ def counter(name: str, dim: str) -> str | None: ) if under_name: # Check for equal columns - if isinstance(obj, series_t): + if isinstance(obj, pd.Series): if obj.name != under.name: raise ValueError( f"{name} and {under_name} must have the " @@ -292,7 +294,7 @@ def mapping(self) -> dict[str, dict[Hashable, Scalar]]: return self._mapping @property - def dataset(self) -> DataFrame: + def dataset(self) -> pd.DataFrame: """Complete data set.""" return self._data.data @@ -301,29 +303,29 @@ def dataset(self, value: XSelector): self._data.data = self._check_setter("dataset", value) @property - def train(self) -> DataFrame: + def train(self) -> pd.DataFrame: """Training set.""" return self._data.data.loc[self._data.train_idx] @train.setter def train(self, value: XSelector): df = self._check_setter("train", value) - self._data.data = bk.concat([df, self.test]) + self._data.data = pd.concat([df, self.test]) self._data.train_idx = df.index @property - def test(self) -> DataFrame: + def test(self) -> pd.DataFrame: """Test set.""" return self._data.data.loc[self._data.test_idx] @test.setter def test(self, value: XSelector): df = self._check_setter("test", value) - self._data.data = bk.concat([self.train, df]) + self._data.data = pd.concat([self.train, df]) self._data.test_idx = df.index @cached_property - def holdout(self) -> DataFrame | None: + def holdout(self) -> pd.DataFrame | None: """Holdout set.""" if self._holdout is not None: return merge( @@ -336,7 +338,7 @@ def holdout(self) -> DataFrame | None: return None @property - def X(self) -> DataFrame: + def X(self) -> pd.DataFrame: """Feature set.""" return self._data.data[self.features] @@ -356,14 +358,14 @@ def y(self, value: YSelector): self._data.data = merge(self.X, series) @property - def X_train(self) -> DataFrame: + def X_train(self) -> pd.DataFrame: """Features of the training set.""" return self.train[self.features] @X_train.setter def X_train(self, value: XSelector): df = self._check_setter("X_train", value) - self._data.data = bk.concat([merge(df, self.y_train), self.test]) + self._data.data = pd.concat([merge(df, self.y_train), self.test]) @property def y_train(self) -> Pandas: @@ -373,17 +375,17 @@ def y_train(self) -> Pandas: @y_train.setter def y_train(self, value: YSelector): series = self._check_setter("y_train", value) - self._data.data = bk.concat([merge(self.X_train, series), self.test]) + self._data.data = pd.concat([merge(self.X_train, series), self.test]) @property - def X_test(self) -> DataFrame: + def X_test(self) -> pd.DataFrame: """Features of the test set.""" return self.test[self.features] @X_test.setter def X_test(self, value: XSelector): df = self._check_setter("X_test", value) - self._data.data = bk.concat([self.train, merge(df, self.y_test)]) + self._data.data = pd.concat([self.train, merge(df, self.y_test)]) @property def y_test(self) -> Pandas: @@ -393,7 +395,7 @@ def y_test(self) -> Pandas: @y_test.setter def y_test(self, value: YSelector): series = self._check_setter("y_test", value) - self._data.data = bk.concat([self.train, 
merge(self.X_test, series)]) + self._data.data = pd.concat([self.train, merge(self.X_test, series)]) @property def shape(self) -> tuple[Int, Int]: @@ -401,42 +403,58 @@ def shape(self) -> tuple[Int, Int]: return self.dataset.shape @property - def columns(self) -> Index: + def columns(self) -> pd.Index: """Name of all the columns.""" return self.dataset.columns @property - def n_columns(self) -> Int: + def n_columns(self) -> int: """Number of columns.""" return len(self.columns) @property - def features(self) -> Index: + def features(self) -> pd.Index: """Name of the features.""" - return self.columns[:-self._data.n_cols] + return self.columns[:-self._data.n_targets] @property - def n_features(self) -> Int: + def n_features(self) -> int: """Number of features.""" return len(self.features) @property def target(self) -> str | list[str]: """Name of the target column(s).""" - return flt(list(self.columns[-self._data.n_cols:])) + return flt(list(self.columns[-self._data.n_targets:])) @property - def _all(self) -> DataFrame: + def _all(self) -> pd.DataFrame: """Dataset + holdout. Note that calling this property triggers the holdout set calculation. """ - return bk.concat([self.dataset, self.holdout]) + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> + @classmethod + def _get_data_attrs(cls) -> list[str]: + """Get the data attributes of the class. + + Returns + ------- + list of str + Data properties. + + """ + return [ + x + for x in dir(cls) + if isinstance(getattr(cls, x), property) and not x.startswith("_") + ] + @overload def _get_rows( self, @@ -451,14 +469,14 @@ def _get_rows( rows: RowSelector, *, return_X_y: Literal[True], - ) -> tuple[DataFrame, Pandas]: ... + ) -> tuple[pd.DataFrame, Pandas]: ... def _get_rows( self, rows: RowSelector, *, return_X_y: Bool = False, - ) -> DataFrame | tuple[DataFrame, Pandas]: + ) -> pd.DataFrame | tuple[pd.DataFrame, Pandas]: """Get a subset of the rows. Rows can be selected by name, index, data set or regex pattern. @@ -479,10 +497,10 @@ def _get_rows( Returns ------- - dataframe + pd.DataFrame Subset of rows. - series or dataframe + pd.Series or pd.Dataframe Subset of target column. Only returned if return_X_y=True. """ @@ -490,9 +508,9 @@ def _get_rows( inc: list[Hashable] = [] exc: list[Hashable] = [] - if isinstance(rows, dataframe_t): + if isinstance(rows, pd.DataFrame): inc.extend(rows.index) - elif isinstance(rows, index_t): + elif isinstance(rows, pd.Index): inc.extend(rows) elif isinstance(rows, segment_t): inc.extend(_all.index[rows]) @@ -590,7 +608,7 @@ def _get_columns( return list(df.select_dtypes(include=["number"]).columns) else: return list(df.columns) - elif isinstance(columns, dataframe_t): + elif isinstance(columns, pd.DataFrame): inc.extend(list(columns.columns)) elif isinstance(columns, segment_t): inc.extend(list(df.columns[columns])) @@ -755,7 +773,7 @@ def get_class( if only_columns and not isinstance(target, tuple): return get_column(target) elif isinstance(target, tuple): - if not isinstance(self.y, dataframe_t): + if not isinstance(self.y, pd.DataFrame): raise ValueError( f"Invalid value for the target parameter, got {target}. " "A tuple is only accepted for multioutput tasks." @@ -831,3 +849,27 @@ def store(self, *, assign: Bool = True): if assign: self._container = None + + def check_scaling(self) -> bool: + """Whether the feature set is scaled. + + A data set is considered scaled when it has mean~0 and std~1, + or when there is a scaler in the pipeline. 
Categorical and + binary columns (only zeros and ones) are excluded from the + calculation. + + Returns + ------- + bool + Whether the feature set is scaled. + + """ + if any("scaler" in name.lower() for name in self.pipeline.named_steps): + return True + + df = self.X.loc[:, (~self.X.isin([0, 1])).any(axis=0)] # Remove binary columns + + if df.empty: # All columns are binary -> no scaling needed + return True + else: + return check_scaling(df) diff --git a/atom/branch/branchmanager.py b/atom/branch/branchmanager.py index 0d2a36f7d..7a0cd96d6 100644 --- a/atom/branch/branchmanager.py +++ b/atom/branch/branchmanager.py @@ -11,12 +11,13 @@ from collections.abc import Iterator from copy import copy, deepcopy +import pandas as pd from beartype import beartype from joblib.memory import Memory from sklearn.utils.validation import check_memory from atom.branch.branch import Branch -from atom.utils.types import Bool, DataFrame, Int +from atom.utils.types import Bool, Int from atom.utils.utils import ClassMap, DataContainer @@ -99,7 +100,7 @@ def __repr__(self) -> str: """Print containing branches.""" return f"BranchManager([{', '.join(self.branches.keys())}], og={self.og.name})" - def __len__(self) -> Int: + def __len__(self) -> int: """Get the number of branches in the manager.""" return len(self.branches) @@ -212,7 +213,7 @@ def add(self, name: str, parent: Branch | None = None): if parent: self._copy_from_parent(self.current, parent) - def fill(self, data: DataContainer, holdout: DataFrame | None = None): + def fill(self, data: DataContainer, holdout: pd.DataFrame | None = None): """Fill the current branch with data. Parameters diff --git a/atom/branch/dataengines.py b/atom/branch/dataengines.py new file mode 100644 index 000000000..95c118034 --- /dev/null +++ b/atom/branch/dataengines.py @@ -0,0 +1,209 @@ +"""Automated Tool for Optimized Modeling (ATOM). + +Author: Mavs +Description: Module containing the data engines. + +""" + +from __future__ import annotations + +from abc import ABCMeta, abstractmethod + +import dask.dataframe as dd +import modin.pandas as md +import numpy as np +import pandas as pd +import polars as pl +import pyarrow as pa +import pyspark +import pyspark.pandas as ps + +from atom.utils.types import Any, DataFrame, Pandas, Sequence + + +class DataEngine(metaclass=ABCMeta): + """Abstract class for data engines. + + Data engines convert a pandas object to a specific type. + The type is determined by the data engine. + + """ + + @staticmethod + @abstractmethod + def convert(obj: Pandas) -> np.ndarray | Sequence[Any] | DataFrame: ... 
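Illustrative sketch (not part of the patch): a hypothetical engine written against
this interface. The concrete engines below each expose a `library` attribute (used
by `check_dependency` in the engine setter) and a static `convert` that receives a
pandas object; the records-style output here is invented for the example.

    import pandas as pd

    from atom.branch.dataengines import DataEngine  # abstract base defined above

    class RecordsEngine(DataEngine):
        """Hypothetical engine returning plain Python records."""

        library = "pandas"  # dependency name checked via check_dependency(...)

        @staticmethod
        def convert(obj: pd.Series | pd.DataFrame) -> list:
            # A pandas object comes in; any backend representation may go out.
            if isinstance(obj, pd.DataFrame):
                return obj.to_dict(orient="records")
            return obj.to_list()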
+ + +class NumpyEngine(DataEngine): + """Numpy data engine.""" + + library = "numpy" + + @staticmethod + def convert(obj: Pandas) -> np.ndarray: + """Convert to numpy array.""" + return obj.to_numpy() + + +class PandasNumpyEngine(DataEngine): + """Pandas numpy data engine.""" + + library = "pandas" + + @staticmethod + def convert(obj: Pandas) -> Pandas: + """Convert to numpy dtypes.""" + if isinstance(obj, pd.DataFrame): + return obj.astype( + { + c: t.numpy_dtype + for c, t in obj.dtypes.items() + if hasattr(t, "numpy_dtype") + } + ) + elif hasattr(obj.dtype, "numpy_dtype"): + return obj.astype(obj.dtype.numpy_dtype) + else: + return obj + + +class PandasPyarrowEngine(DataEngine): + """Pandas pyarrow data engine.""" + + library = "pandas" + + @staticmethod + def convert(obj: Pandas) -> Pandas: + """Convert to pyarrow dtypes.""" + if isinstance(obj, pd.DataFrame): + return obj.astype( + { + c: pd.ArrowDtype(pa.from_numpy_dtype(t)) + for c, t in obj.dtypes.items() + if isinstance(t, np.dtype) + } + ) + elif isinstance(obj.dtype, np.dtype): + return obj.astype(pd.ArrowDtype(pa.from_numpy_dtype(obj.dtype))) + else: + return obj + + +class PolarsEngine(DataEngine): + """Polars data engine.""" + + library = "polars" + + @staticmethod + def convert(obj: Pandas) -> pl.Series | pl.DataFrame: + """Convert to polars objects.""" + import polars as pl + + if isinstance(obj, pd.DataFrame): + return pl.DataFrame(obj) + elif isinstance(obj, pd.Series): + return pl.Series(obj) + + +class PolarsLazyEngine(DataEngine): + """Polars lazy data engine.""" + + library = "polars" + + @staticmethod + def convert(obj: Pandas) -> pl.Series | pl.DataFrame: + """Convert to lazy polars objects.""" + import polars as pl + + if isinstance(obj, pd.DataFrame): + return pl.LazyFrame(obj) + elif isinstance(obj, pd.Series): + return pl.Series(obj) + + +class PyArrowEngine(DataEngine): + """PyArrow data engine.""" + + library = "pyarrow" + + @staticmethod + def convert(obj: Pandas) -> pa.Array | pa.Table: + """Convert to pyarrow objects.""" + import pyarrow as pa + + if isinstance(obj, pd.DataFrame): + return pa.Table.from_pandas(obj) + elif isinstance(obj, pd.Series): + return pa.Array.from_pandas(obj) + + +class ModinEngine(DataEngine): + """Modin data engine.""" + + library = "modin" + + @staticmethod + def convert(obj: Pandas) -> md.Series | md.DataFrame: + """Convert to modin objects.""" + import modin.pandas as md + + if isinstance(obj, pd.DataFrame): + return md.DataFrame(obj) + elif isinstance(obj, pd.Series): + return md.Series(obj) + + +class DaskEngine(DataEngine): + """Dask data engine.""" + + library = "dask" + + @staticmethod + def convert(obj: Pandas) -> dd.Series | dd.DataFrame: + """Convert to dask objects.""" + import dask.dataframe as dd + + return dd.from_pandas(obj) + + +class PySparkEngine(DataEngine): + """PySpark data engine.""" + + library = "pyspark" + + @staticmethod + def convert(obj: Pandas) -> pyspark.sql.DataFrame: + """Convert to pyspark objects.""" + from pyspark.sql import SparkSession + + spark = SparkSession.builder.appName("atom-ml").getOrCreate() + return spark.createDataFrame(obj) + + +class PySparkPandasEngine(DataEngine): + """PySpark data engine with pandas API.""" + + library = "pyspark" + + @staticmethod + def convert(obj: Pandas) -> ps.Series | ps.DataFrame: + """Convert to pyspark objects.""" + import pyspark.pandas as ps + + if isinstance(obj, pd.DataFrame): + return ps.DataFrame(obj) + elif isinstance(obj, pd.Series): + return ps.Series(obj) + + +DATA_ENGINES = { + "numpy": 
NumpyEngine, + "pandas": PandasNumpyEngine, + "pandas-pyarrow": PandasPyarrowEngine, + "polars": PolarsEngine, + "polars-lazy": PolarsLazyEngine, + "modin": ModinEngine, + "dask": DaskEngine, + "pyspark": PySparkEngine, + "pyspark-pandas": PySparkPandasEngine, +} diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 165e31475..07840e0a7 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -55,18 +55,17 @@ Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine, EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, - Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, - Sequence, Series, Transformer, Verbose, XConstructor, YConstructor, - dataframe_t, sequence_t, series_t, + Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, + Tabular, Transformer, Verbose, XConstructor, YConstructor, sequence_t, ) from atom.utils.utils import ( - Goal, bk, check_is_fitted, composed, crash, get_col_order, get_cols, it, - lst, make_sklearn, merge, method_to_log, n_cols, replace_missing, sign, - to_df, to_series, variable_return, wrap_transformer_methods, + Goal, check_is_fitted, composed, crash, get_col_order, get_cols, it, lst, + make_sklearn, merge, method_to_log, n_cols, replace_missing, sign, to_df, + to_series, variable_return, wrap_transformer_methods, ) -T = TypeVar("T", bound=Transformer) +T_Transformer = TypeVar("T_Transformer", bound=Transformer) @beartype @@ -107,7 +106,7 @@ def __repr__(self, N_CHAR_MAX: Int = 700) -> str: return out - def __sklearn_clone__(self: T) -> T: + def __sklearn_clone__(self: T_Transformer) -> T_Transformer: """Wrap cloning method to attach internal attributes.""" cloned = _clone_parametrized(self) @@ -135,17 +134,7 @@ def fit( X is ignored. y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + Target column(s) corresponding to `X`. **fit_params Additional keyword arguments for the fit method. @@ -166,7 +155,7 @@ def fit_transform( X: XConstructor | None = None, y: YConstructor | None = None, **fit_params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Fit to data, then transform it. Parameters @@ -176,17 +165,7 @@ def fit_transform( X is ignored. y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + Target column(s) corresponding to `X`. **fit_params Additional keyword arguments for the fit method. 
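Illustrative sketch (not part of the patch): converting a pandas object through the
DATA_ENGINES mapping defined in the new atom/branch/dataengines.py above.
BaseTransformer._convert performs the equivalent lookup via
`engine.data_engine.convert(...)`. Note that the new module imports modin, dask,
polars, pyarrow and pyspark at module level, so this assumes those optional
backends are installed.

    import pandas as pd

    from atom.branch.dataengines import DATA_ENGINES

    X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})

    # Keys follow the engine strings documented elsewhere in this patch.
    engine = DATA_ENGINES["polars"]
    X_polars = engine.convert(X)  # returns a polars.DataFrame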
@@ -207,7 +186,7 @@ def inverse_transform( self, X: XConstructor | None = None, y: YConstructor | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Do nothing. Returns the input unchanged. Implemented for continuity of the @@ -220,11 +199,11 @@ def inverse_transform( X is ignored. y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -238,7 +217,7 @@ def inverse_transform( Feature set. Only returned if provided. series or dataframe - Target column. Only returned if provided. + Target column(s). Only returned if provided. """ return variable_return(X, y) @@ -365,7 +344,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas = -1) -> Self: + def fit(self, X: DataFrame, y: Tabular = -1) -> Self: """Fit to data. Parameters @@ -374,11 +353,11 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self: Feature set with shape=(n_samples, n_features). y: int, str, dict or sequence, default=-1 - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -391,7 +370,7 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self: Estimator instance. """ - if isinstance(y, series_t): + if isinstance(y, pd.Series): self.target_names_in_ = np.array([y.name]) else: raise ValueError("The Balancer class does not support multioutput tasks.") @@ -454,7 +433,7 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]: + def transform(self, X: DataFrame, y: Tabular = -1) -> tuple[DataFrame, Series]: """Balance the data. Parameters @@ -463,10 +442,10 @@ def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]: Feature set with shape=(n_samples, n_features). y: int, str or sequence, default=-1 - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - Else: Array with shape=(n_samples,) to use as target. 
Returns @@ -563,7 +542,7 @@ def log_changes(y): self._log(f" --> Removing {diff} samples from class: {key}.", 2) # Add the new samples to the old dataframe - X, y = bk.concat([X, X_new]), bk.concat([y, y_new]) + X, y = pd.concat([X, X_new]), pd.concat([y, y_new]) return X, y @@ -734,7 +713,7 @@ def __init__( self.encode_target = encode_target @composed(crash, method_to_log) - def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame | None = None, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -744,11 +723,11 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: X is ignored. y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -774,13 +753,13 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: self._drop_cols = list(X.select_dtypes(include=lst(self.drop_dtypes)).columns) if y is not None: - if isinstance(y, series_t): + if isinstance(y, pd.Series): self.target_names_in_ = np.array([y.name]) else: self.target_names_in_ = y.columns.to_numpy() if self.drop_chars: - if isinstance(y, series_t): + if isinstance(y, pd.Series): y.name = re.sub(self.drop_chars, "", str(y.name)) else: y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) @@ -833,8 +812,8 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> def transform( self, X: DataFrame | None = None, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + y: Tabular | None = None, + ) -> Tabular | tuple[DataFrame, Tabular]: """Apply the data cleaning steps to the data. Parameters @@ -844,11 +823,11 @@ def transform( X is ignored. y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -899,7 +878,7 @@ def transform( if y is not None: if self.drop_chars: - if isinstance(y, series_t): + if isinstance(y, pd.Series): y.name = re.sub(self.drop_chars, "", str(y.name)) else: y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) @@ -932,7 +911,7 @@ def transform( ) # Replace target with encoded column(s) - if isinstance(y, series_t): + if isinstance(y, pd.Series): yt = out else: yt = merge(yt, out) @@ -951,8 +930,8 @@ def transform( def inverse_transform( self, X: DataFrame | None = None, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + y: Tabular | None = None, + ) -> Tabular | tuple[DataFrame, Tabular]: """Inversely transform the label encoding. This method only inversely transforms the target encoding. 
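Illustrative sketch (not part of the patch): the round-trip that
Cleaner.inverse_transform undoes for an encoded target. The hunks that follow show
the Cleaner keeping a fitted LabelEncoder per target column; this sketch calls
scikit-learn's LabelEncoder directly on invented labels.

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    y = pd.Series(["cat", "dog", "cat", "bird"], name="target")

    le = LabelEncoder()
    encoded = pd.Series(le.fit_transform(y), index=y.index, name=y.name)
    restored = pd.Series(le.inverse_transform(encoded), index=y.index, name=y.name)

    assert restored.tolist() == y.tolist()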
@@ -965,11 +944,11 @@ def inverse_transform( Do nothing. Implemented for continuity of the API. y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -993,22 +972,22 @@ def inverse_transform( if est := self._estimators.get(col): if est.__class__.__name__ == "LabelEncoder": self._log(f" --> Inversely label-encoding column {col}.", 2) - out = est.inverse_transform(bk.DataFrame(y)[col]) + out = est.inverse_transform(pd.DataFrame(y)[col]) - elif isinstance(y, dataframe_t): + elif isinstance(y, pd.DataFrame): self._log(f" --> Inversely label-binarizing column {col}.", 2) out = est.inverse_transform( y.loc[:, y.columns.str.startswith(f"{col}_")].to_numpy() ) # Replace encoded columns with target column - if isinstance(y, series_t): + if isinstance(y, pd.Series): yt = to_series(out, y.index, col) else: yt = merge(yt, to_series(out, y.index, col)) else: # Add unchanged column - yt = merge(yt, bk.DataFrame(y)[col]) + yt = merge(yt, pd.DataFrame(y)[col]) y = yt @@ -1156,7 +1135,7 @@ def __init__( self.seasonal_model = seasonal_model @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -1225,7 +1204,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Decompose the data. Parameters @@ -1250,7 +1229,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: return X @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Inversely transform the data. Parameters @@ -1445,7 +1424,7 @@ def __init__( self.labels = labels @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -1566,14 +1545,14 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: # Make of cut a transformer self._estimators[col] = FunctionTransformer( - func=bk.cut, + func=pd.cut, kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)}, ).fit(X[[col]]) return self @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Bin the data into intervals. Parameters @@ -1762,7 +1741,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. 
Note that leaving y=None can lead to errors if the `strategy` @@ -1775,11 +1754,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Feature set with shape=(n_samples, n_features). y: int, str, dict, sequence or dataframe-like - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -1936,7 +1915,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return get_col_order(cols, self.feature_names_in_, self._estimator.feature_names_in_) @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Encode the data. Parameters @@ -1999,7 +1978,7 @@ class Imputer(TransformerMixin, _SetOutputMixin): Parameters ---------- - strat_num: str, int or float, default="drop" + strat_num: str, int or float, default="mean" Imputing strategy for numerical columns. Choose from: - "drop": Drop rows containing missing values. @@ -2019,7 +1998,7 @@ class Imputer(TransformerMixin, _SetOutputMixin): of column. - int or float: Impute with provided numerical value. - strat_cat: str, default="drop" + strat_cat: str, default="most_frequent" Imputing strategy for categorical columns. Choose from: - "drop": Drop rows containing missing values. @@ -2145,8 +2124,8 @@ class Imputer(TransformerMixin, _SetOutputMixin): def __init__( self, - strat_num: Scalar | NumericalStrats = "drop", - strat_cat: str | CategoricalStrats = "drop", + strat_num: Scalar | NumericalStrats = "mean", + strat_cat: str | CategoricalStrats = "most_frequent", *, max_nan_rows: FloatLargerZero | None = None, max_nan_cols: FloatLargerZero | None = None, @@ -2169,7 +2148,7 @@ def __init__( self.max_nan_cols = max_nan_cols @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -2299,8 +2278,8 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> def transform( self, X: DataFrame, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + y: Tabular | None = None, + ) -> Tabular | tuple[DataFrame, Tabular]: """Impute the missing values. Note that leaving y=None can lead to inconsistencies in @@ -2313,11 +2292,11 @@ def transform( Feature set with shape=(n_samples, n_features). y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. 
- If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -2576,7 +2555,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -2636,7 +2615,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Apply the transformations to the data. Parameters @@ -2661,7 +2640,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: return X[self.feature_names_in_] @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Apply the inverse transformation to the data. Parameters @@ -2854,8 +2833,8 @@ def __init__( def transform( self, X: DataFrame, - y: Pandas | None = None, - ) -> Pandas | tuple[DataFrame, Pandas]: + y: Tabular | None = None, + ) -> Tabular | tuple[DataFrame, Tabular]: """Apply the outlier strategy on the data. Parameters @@ -2864,11 +2843,11 @@ def transform( Feature set with shape=(n_samples, n_features). y: int, str, dict, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -2995,9 +2974,9 @@ def transform( else: # Replace the columns in X and y with the new values from objective X.update(objective) - if isinstance(y, series_t) and y.name in objective: + if isinstance(y, pd.Series) and y.name in objective: y.update(objective[str(y.name)]) - elif isinstance(y, dataframe_t): + elif isinstance(y, pd.DataFrame): y.update(objective) return variable_return(X, y) @@ -3129,7 +3108,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -3177,7 +3156,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Perform standardization by centering and scaling. Parameters @@ -3202,7 +3181,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: return X @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Apply the inverse transformation to the data. 
Parameters diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 16e745f84..fa5896d69 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -12,7 +12,6 @@ from typing import Any, Literal import featuretools as ft -import joblib import numpy as np import pandas as pd from beartype import beartype @@ -34,13 +33,13 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( - Backend, Bool, DataFrame, Engine, FeatureSelectionSolvers, - FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, - FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, - Pandas, Scalar, Sequence, Series, Verbose, series_t, + Bool, DataFrame, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, + FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, + IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, + Series, Tabular, Verbose, ) from atom.utils.utils import ( - Goal, Task, bk, check_is_fitted, check_scaling, composed, crash, + Goal, Task, check_is_fitted, check_scaling, composed, crash, get_custom_scorer, is_sparse, lst, merge, method_to_log, sign, ) @@ -174,7 +173,7 @@ def __init__( self.from_index = from_index @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Extract the new features. Parameters @@ -195,7 +194,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: if self.from_index: if hasattr(X.index, "to_timestamp"): - Xc = bk.DataFrame(X.index.to_timestamp()) + Xc = pd.DataFrame(X.index.to_timestamp()) order = Xc.columns.tolist() + X.columns.tolist() else: raise ValueError("Unable to convert the index to a timestamp format.") @@ -203,7 +202,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Xc = X.select_dtypes(exclude="number") order = X.columns.tolist() - Xt = bk.DataFrame(index=X.index) + Xt = pd.DataFrame(index=X.index) for name, column in Xc.items(): col_dt = pd.to_datetime( arg=column, @@ -228,7 +227,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f"{fx.lower()} is not an attribute of pd.Series.dt." ) - if not isinstance(series, series_t): + if not isinstance(series, pd.Series): self._log( f" --> Extracting feature {fx} " "failed. Result is not a Series.dt.", 2, @@ -421,7 +420,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -430,11 +429,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Feature set with shape=(n_samples, n_features). y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. 
- If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -510,7 +509,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Generate new features. Parameters @@ -682,7 +681,7 @@ def __init__( self.drop_columns = drop_columns @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Group features. Parameters @@ -921,16 +920,6 @@ class FeatureSelector(TransformerMixin): - "sklearnex" - "cuml" - backend: str, default="loky" - Parallelization backend. Read more in the - [user guide][parallel-execution]. Choose from: - - - "loky": Single-node, process-based parallelism. - - "multiprocessing": Legacy single-node, process-based - parallelism. Less robust than `loky`. - - "threading": Single-node, thread-based parallelism. - - "ray": Multi-node, process-based parallelism. - verbose: int, default=0 Verbosity level of the class. Choose from: @@ -1015,7 +1004,6 @@ def __init__( n_jobs: NJobs = 1, device: str = "cpu", engine: Engine = None, - backend: Backend = "loky", verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, **kwargs, @@ -1024,7 +1012,6 @@ def __init__( n_jobs=n_jobs, device=device, engine=engine, - backend=backend, verbose=verbose, random_state=random_state, ) @@ -1037,7 +1024,7 @@ def __init__( self.kwargs = kwargs @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit the feature selector to the data. The univariate, sfm (when model is not fitted), sfs, rfe and @@ -1050,11 +1037,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Feature set with shape=(n_samples, n_features). y: int, str, sequence, dataframe-like or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput @@ -1393,7 +1380,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) - with joblib.parallel_backend(backend=self.backend): self._estimator.fit(X, y) else: @@ -1492,7 +1478,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> ) @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Transform the data. 
Parameters diff --git a/atom/models/classreg.py b/atom/models/classreg.py index 9dfac9c04..50272c83a 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -22,7 +22,7 @@ from optuna.trial import Trial from atom.basemodel import BaseModel -from atom.utils.types import DataFrame, Pandas, Predictor +from atom.utils.types import Pandas, Predictor from atom.utils.utils import CatBMetric, Goal, LGBMetric, XGBMetric @@ -485,8 +485,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ): """Fit the estimator and perform in-training validation. @@ -1675,8 +1675,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ): """Fit the estimator and perform in-training validation. @@ -3082,7 +3082,7 @@ class XGBoost(BaseModel): } @property - def trials(self) -> pd.DataFrame: + def trials(self) -> pd.pd.DataFrame: """Overview of the trials' results. This property is only available for models that ran @@ -3132,8 +3132,8 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: def _fit_estimator( self, estimator: Predictor, - data: tuple[DataFrame, Pandas], - validation: tuple[DataFrame, Pandas] | None = None, + data: tuple[pd.DataFrame, Pandas], + validation: tuple[pd.DataFrame, Pandas] | None = None, trial: Trial | None = None, ): """Fit the estimator and perform in-training validation. diff --git a/atom/models/custom.py b/atom/models/custom.py index 6c9e49495..ae10fe3be 100644 --- a/atom/models/custom.py +++ b/atom/models/custom.py @@ -5,7 +5,6 @@ """ -from functools import cached_property from typing import Any from atom.basemodel import BaseModel @@ -56,7 +55,7 @@ def fullname(self) -> str: """Return the estimator's class name.""" return self._est_class.__name__ - @cached_property + @property def _est_class(self) -> type[Predictor]: """Return the estimator's class.""" return self._est diff --git a/atom/nlp.py b/atom/nlp.py index 3e05e57fb..94dccdf39 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -28,7 +28,7 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, DataFrame, Engine, FloatLargerZero, Pandas, Sequence, + Bool, DataFrame, Engine, FloatLargerZero, Sequence, Tabular, VectorizerStarts, Verbose, bool_t, ) from atom.utils.utils import ( @@ -194,7 +194,7 @@ def __init__( self.drop_punctuation = drop_punctuation @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Apply the transformations to the data. Parameters @@ -445,7 +445,7 @@ def __init__( self.lemmatize = lemmatize @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Normalize the text. 
Parameters @@ -665,7 +665,7 @@ def __init__( self.quadgram_freq = quadgram_freq @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Tokenize the text. Parameters @@ -924,7 +924,7 @@ def _get_corpus_columns(self) -> list[str]: ) @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: """Fit to data. Parameters @@ -995,7 +995,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return np.array(og_columns + self._get_corpus_columns()) @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: """Vectorize the text. Parameters diff --git a/atom/pipeline.py b/atom/pipeline.py index e09c3578f..0a00b0b3b 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -26,8 +26,8 @@ from typing_extensions import Self from atom.utils.types import ( - Bool, DataFrame, Estimator, FHConstructor, Float, Pandas, Scalar, Sequence, - Verbose, XConstructor, YConstructor, + Bool, DataFrame, Estimator, FHConstructor, Float, Scalar, Sequence, + Tabular, Verbose, XConstructor, YConstructor, ) from atom.utils.utils import ( NotFittedError, adjust_verbosity, check_is_fitted, fit_one, @@ -55,6 +55,7 @@ class Pipeline(SkPipeline): and additionally: - Can initialize with an empty pipeline. + - Always returns 'pandas' objects. - Accepts transformers that drop rows. - Accepts transformers that only are fitted on a subset of the provided dataset. @@ -273,7 +274,7 @@ def _fit( X: XConstructor | None = None, y: YConstructor | None = None, routed_params: dict[str, Bunch] | None = None, - ) -> tuple[DataFrame | None, Pandas | None]: + ) -> tuple[DataFrame | None, Tabular | None]: """Get data transformed through the pipeline. Parameters @@ -283,7 +284,7 @@ def _fit( X is ignored. None if the pipeline only uses y. y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. routed_params: dict or None, default=None Metadata parameters routed for the fit method. @@ -431,7 +432,7 @@ def fit( X is ignored. y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. **params Parameters requested and accepted by steps. Each step must @@ -465,7 +466,7 @@ def fit_transform( X: XConstructor | None = None, y: YConstructor | None = None, **params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Fit the pipeline and transform the data. Call `fit` followed by `transform` on each transformer in the @@ -483,7 +484,7 @@ def fit_transform( if the estimator only uses y. y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. **params Parameters requested and accepted by steps. Each step must @@ -524,7 +525,7 @@ def transform( *, filter_train_only: Bool = True, **params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Transform the data. Call `transform` on each transformer in the pipeline. The @@ -541,7 +542,7 @@ def transform( X is ignored. None if the pipeline only uses y. y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. 
+ Target column(s) corresponding to `X`. filter_train_only: bool, default=True Whether to exclude transformers that should only be used @@ -586,7 +587,7 @@ def inverse_transform( *, filter_train_only: Bool = True, **params, - ) -> Pandas | tuple[DataFrame, Pandas]: + ) -> Tabular | tuple[DataFrame, Tabular]: """Inverse transform for each step in a reverse order. All estimators in the pipeline must implement the @@ -599,7 +600,7 @@ def inverse_transform( X is ignored. None if the pipeline only uses y. y: dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. + Target column(s) corresponding to `X`. filter_train_only: bool, default=True Whether to exclude transformers that should only be used @@ -683,7 +684,7 @@ def predict( X: XConstructor | None = None, fh: FHConstructor | None = None, **params, - ) -> np.ndarray | Pandas: + ) -> np.ndarray | Tabular: """Transform, then predict of the final estimator. Parameters @@ -736,7 +737,7 @@ def predict_interval( X: XConstructor | None = None, *, coverage: Float | Sequence[Float] = 0.9, - ) -> Pandas: + ) -> Tabular: """Transform, then predict_quantiles of the final estimator. Parameters @@ -861,7 +862,7 @@ def predict_quantiles( X: XConstructor | None = None, *, alpha: Float | Sequence[Float] = (0.05, 0.95), - ) -> Pandas: + ) -> Tabular: """Transform, then predict_quantiles of the final estimator. Parameters @@ -894,7 +895,7 @@ def predict_residuals( self, y: YConstructor, X: XConstructor | None = None, - ) -> Pandas: + ) -> Tabular: """Transform, then predict_residuals of the final estimator. Parameters diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index 5ea7aa3ca..793653bef 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -15,6 +15,7 @@ import matplotlib.pyplot as plt import numpy as np +import pandas as pd import plotly.express as px import plotly.graph_objects as go from beartype import beartype @@ -25,9 +26,9 @@ from atom.plots.basefigure import BaseFigure from atom.utils.constants import PALETTE from atom.utils.types import ( - Bool, DataFrame, FloatLargerZero, FloatZeroToOneExc, Index, Int, - IntLargerZero, Legend, MetricSelector, Model, ModelsSelector, PlotBackend, - RowSelector, Scalar, Sequence, int_t, sequence_t, + Bool, FloatLargerZero, FloatZeroToOneExc, Int, IntLargerZero, Legend, + MetricSelector, Model, ModelsSelector, PlotBackend, RowSelector, Scalar, + Sequence, int_t, sequence_t, ) from atom.utils.utils import ( Aesthetics, check_is_fitted, composed, crash, get_custom_scorer, lst, @@ -139,7 +140,7 @@ def marker_size(self, value: FloatLargerZero): # Methods ====================================================== >> @staticmethod - def _get_plot_index(df: DataFrame) -> Index: + def _get_plot_index(df: pd.DataFrame) -> pd.Index: """Return the dataset's index in a plottable format. 
Plotly does not accept all index formats (e.g., pd.Period), diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py index d23499cfe..038e7f86d 100644 --- a/atom/plots/hyperparametertuningplot.py +++ b/atom/plots/hyperparametertuningplot.py @@ -14,6 +14,7 @@ from typing import Any import numpy as np +import pandas as pd import plotly.graph_objects as go from optuna.importance import FanovaImportanceEvaluator from optuna.trial import TrialState @@ -32,7 +33,7 @@ int_t, segment_t, ) from atom.utils.utils import ( - bk, check_dependency, crash, divide, get_segment, it, lst, rnd, + check_dependency, crash, divide, get_segment, it, lst, rnd, ) @@ -244,8 +245,8 @@ def plot_edf( models_c = self._check_hyperparams(models_c) metric_c = self._get_metric(metric) - x_min = bk.concat([m.trials[metric_c] for m in models_c]).min(axis=None) - x_max = bk.concat([m.trials[metric_c] for m in models_c]).max(axis=None) + x_min = pd.concat([m.trials[metric_c] for m in models_c]).min(axis=None) + x_max = pd.concat([m.trials[metric_c] for m in models_c]).max(axis=None) x = np.linspace(x_min, x_max, 100) self._get_figure() diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index e3dcf6a68..016d4bced 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -39,10 +39,10 @@ Bool, ColumnSelector, FloatZeroToOneExc, Int, IntLargerEqualZero, IntLargerFour, IntLargerZero, Kind, Legend, MetricConstructor, MetricSelector, ModelsSelector, RowSelector, Sequence, TargetSelector, - TargetsSelector, XSelector, index_t, + TargetsSelector, XSelector, ) from atom.utils.utils import ( - Task, bk, check_canvas, check_dependency, check_empty, check_predict_proba, + Task, check_canvas, check_dependency, check_empty, check_predict_proba, crash, divide, get_custom_scorer, has_task, lst, rnd, ) @@ -832,7 +832,7 @@ def plot_errors( from atom.models import OrdinaryLeastSquares model = OrdinaryLeastSquares(goal=self._goal) - estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred) + estimator = model._get_est({}).fit(pd.DataFrame(y_true), y_pred) self._draw_line( x=(x := np.linspace(y_true.min(), y_true.max(), 100)), @@ -1233,7 +1233,7 @@ def plot_forecast( for m in models_c: if X is not None: X = m.transform(X) - elif isinstance(fh, index_t): + elif isinstance(fh, pd.Index): X = m.branch._all.loc[fh] # Draw predictions and interval @@ -1887,7 +1887,7 @@ class is always the positive one. data = data.sample(500, random_state=self.random_state) explanation = m._shap.get_explanation(data, target_c) - shap = bk.DataFrame(explanation.values, columns=m.branch.features) + shap = pd.DataFrame(explanation.values, columns=m.branch.features) parshap[ds] = pd.Series(index=fxs, dtype=float) for fx in fxs: @@ -2134,7 +2134,7 @@ def plot_partial_dependence( axes.append((xaxis, yaxis)) # Compute averaged predictions - predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + predictions = Parallel(n_jobs=self.n_jobs)( delayed(partial_dependence)( estimator=m.estimator, X=m.branch.X_test, diff --git a/atom/training.py b/atom/training.py index 6b0fbe8fe..8cd5a97e0 100644 --- a/atom/training.py +++ b/atom/training.py @@ -371,6 +371,7 @@ class DirectClassifier(Direct): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. 
Read more in the @@ -605,6 +606,7 @@ class DirectForecaster(Direct): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -835,6 +837,7 @@ class DirectRegressor(Direct): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1075,6 +1078,7 @@ class SuccessiveHalvingClassifier(SuccessiveHalving): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1312,6 +1316,7 @@ class SuccessiveHalvingForecaster(SuccessiveHalving): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1546,6 +1551,7 @@ class SuccessiveHalvingRegressor(SuccessiveHalving): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -1792,6 +1798,7 @@ class TrainSizingClassifier(TrainSizing): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -2035,6 +2042,7 @@ class TrainSizingForecaster(TrainSizing): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the @@ -2274,6 +2282,7 @@ class TrainSizingRegressor(TrainSizing): parallelism. Less robust than `loky`. - "threading": Single-node, thread-based parallelism. - "ray": Multi-node, process-based parallelism. + - "dask": Multi-node, process-based parallelism. memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. 
Read more in the diff --git a/atom/utils/types.py b/atom/utils/types.py index f9a674aa3..da79e1f08 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -16,6 +16,8 @@ import modin.pandas as md import numpy as np import pandas as pd +import polars as pl +import pyarrow as pa import scipy.sparse as sps from beartype.door import is_bearable from beartype.typing import Protocol @@ -25,7 +27,8 @@ if TYPE_CHECKING: - from atom.utils.utils import ClassMap, Goal + from atom.branch.dataengines import DataEngine + from atom.utils.utils import Goal # Classes for type hinting ========================================= >> @@ -117,6 +120,13 @@ def __repr__(self) -> str: """Print representation as dictionary.""" return self._asdict().__repr__() + @property + def data_engine(self) -> DataEngine: + """Return the data engine.""" + from atom.branch.dataengines import DATA_ENGINES + + return DATA_ENGINES[self.data]() + class SPTuple(NamedTuple): """Return type of the `sp` parameter.""" @@ -126,6 +136,11 @@ class SPTuple(NamedTuple): trend_model: SeasonalityModels = "additive" +@runtime_checkable +class DataFrame(Protocol): + def __dataframe__(self, *args, **kwargs): ... + + @runtime_checkable class SkScorer(Protocol): """Protocol for sklearn's scorers.""" @@ -177,10 +192,10 @@ class Model(Protocol): """Protocol for all models.""" _goal: Goal - _metric: ClassMap - _ht: dict[str, Any] + # _metric: ClassMap + # _ht: dict[str, Any] - def predict(self, *args, **kwargs) -> Pandas: ... + def predict(self, *args, **kwargs) -> Tabular: ... # Variable types for type hinting ================================== >> @@ -190,11 +205,10 @@ def predict(self, *args, **kwargs) -> Pandas: ... Int: TypeAlias = int | np.integer Float: TypeAlias = float | np.floating Scalar: TypeAlias = Int | Float -Segment: TypeAlias = range | slice -Index: TypeAlias = pd.Index | md.Index -Series: TypeAlias = pd.Series | md.Series -DataFrame: TypeAlias = pd.DataFrame | md.DataFrame -Pandas: TypeAlias = Series | DataFrame +Segment: TypeAlias = slice | range +Series: TypeAlias = pd.Series | md.Series | pl.Series | pa.Array +Pandas: TypeAlias = pd.Series | pd.DataFrame +Tabular: TypeAlias = Series | DataFrame # Numerical types IntLargerZero: TypeAlias = Annotated[Int, Is[lambda x: x > 0]] @@ -222,7 +236,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon # Return types for transform methods -TReturn: TypeAlias = np.ndarray | sps.spmatrix | Series | DataFrame +TReturn: TypeAlias = np.ndarray | sps.spmatrix | Sequence[Any] | DataFrame TReturns: TypeAlias = TReturn | tuple[TReturn, TReturn] # Selection of rows or columns by name or position @@ -248,10 +262,21 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
# BaseTransformer parameters NJobs: TypeAlias = Annotated[Int, Is[lambda x: x != 0]] -EngineDataOptions: TypeAlias = Literal["pandas", "pyarrow", "modin"] +EngineDataOptions: TypeAlias = Literal[ + "numpy", + "pandas", + "pandas-pyarrow", + "polars", + "polars-lazy", + "pyarrow", + "modin", + "dask", + "pyspark", + "pyspark-pandas", +] EngineEstimatorOptions: TypeAlias = Literal["sklearn", "sklearnex", "cuml"] Engine: TypeAlias = EngineDataOptions | EngineEstimatorOptions | EngineDict | EngineTuple | None -Backend: TypeAlias = Literal["loky", "multiprocessing", "threading", "ray"] +Backend: TypeAlias = Literal["loky", "multiprocessing", "threading", "ray", "dask"] Warnings: TypeAlias = Literal["default", "error", "ignore", "always", "module", "once"] Severity: TypeAlias = Literal["debug", "info", "warning", "error", "critical"] Verbose: TypeAlias = Literal[0, 1, 2] @@ -370,8 +395,5 @@ def predict(self, *args, **kwargs) -> Pandas: ... int_t = (int, np.integer) float_t = (float, np.floating) segment_t = (slice, range) -index_t = (pd.Index, md.Index) -series_t = (pd.Series, md.Series) -sequence_t = (range, list, tuple, np.ndarray, *index_t, *series_t) -dataframe_t = (pd.DataFrame, md.DataFrame) -pandas_t = (*series_t, *dataframe_t) +sequence_t = (range, list, tuple, np.ndarray, pd.Index, pd.Series) +pandas_t = (pd.Series, pd.DataFrame) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 364193aa1..5231cffbd 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -8,21 +8,19 @@ from __future__ import annotations import functools -import os import sys import warnings from collections import deque -from collections.abc import Callable, Hashable, Iterator +from collections.abc import Callable, Iterator from contextlib import contextmanager from copy import copy from dataclasses import dataclass from enum import Enum, IntEnum from functools import cached_property, wraps from importlib import import_module -from importlib.util import find_spec from inspect import Parameter, signature from itertools import cycle -from types import GeneratorType, MappingProxyType +from types import GeneratorType, MappingProxyType, ModuleType from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload import mlflow @@ -31,6 +29,8 @@ import numpy as np import pandas as pd import plotly.graph_objects as go +import polars as pl +import pyarrow as pa import scipy.sparse as sps from beartype import beartype from beartype.door import is_bearable @@ -40,7 +40,7 @@ from optuna.study import Study from optuna.trial import FrozenTrial from pandas._libs.missing import NAType -from pandas._typing import Axes, Dtype, DtypeArg +from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype from shap import Explainer, Explanation from sklearn.base import BaseEstimator @@ -54,11 +54,11 @@ from atom.utils.constants import __version__ from atom.utils.types import ( - Bool, DataFrame, Estimator, FeatureNamesOut, Float, Index, IndexSelector, - Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Sequence, Series, SPTuple, Transformer, TReturn, TReturns, - Verbose, XConstructor, XSelector, YConstructor, YSelector, dataframe_t, - int_t, pandas_t, segment_t, sequence_t, series_t, + Bool, DataFrame, Estimator, FeatureNamesOut, Float, IndexSelector, Int, + IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, + Scorer, Segment, Sequence, SPTuple, Tabular, Transformer, TReturn, + TReturns, Verbose, XConstructor, XSelector, YConstructor, 
YSelector, int_t, + segment_t, sequence_t, ) @@ -69,14 +69,13 @@ T = TypeVar("T") -T_Pandas = TypeVar("T_Pandas", Series, DataFrame) +T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame) T_Transformer = TypeVar("T_Transformer", bound=Transformer) T_Estimator = TypeVar("T_Estimator", bound=Estimator) # Classes ========================================================== >> - class NotFittedError(ValueError, AttributeError): """Exception called when the instance is not yet fitted. @@ -93,7 +92,7 @@ class Goal(Enum): regression = 1 forecast = 2 - def infer_task(self, y: Pandas) -> Task: + def infer_task(self, y: Tabular) -> Task: """Infer the task corresponding to a target column. Parameters @@ -108,17 +107,17 @@ def infer_task(self, y: Pandas) -> Task: """ if self.value == 1: - if isinstance(y, series_t): + if isinstance(y, pd.Series): return Task.regression else: return Task.multioutput_regression elif self.value == 2: - if isinstance(y, series_t): + if isinstance(y, pd.Series): return Task.univariate_forecast else: return Task.multivariate_forecast - if isinstance(y, dataframe_t): + if isinstance(y, pd.DataFrame): if all(y[col].nunique() == 2 for col in y.columns): return Task.multilabel_classification else: @@ -207,10 +206,10 @@ class SeasonalPeriod(IntEnum): class DataContainer: """Stores a branch's data.""" - data: DataFrame # Complete dataset - train_idx: Index # Indices in the train set - test_idx: Index # Indices in the test - n_cols: Int # Number of target columns + data: pd.DataFrame # Complete dataset + train_idx: pd.Index # Indices in the train set + test_idx: pd.Index # Indices in the test + n_targets: int # Number of target columns @dataclass @@ -254,7 +253,7 @@ class DataConfig: test_size: Scalar = 0.2 holdout_size: Scalar | None = None - def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: + def get_stratify_columns(self, df: DataFrame, y: Tabular) -> DataFrame | None: """Get columns to stratify by. Parameters @@ -263,7 +262,7 @@ def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: Dataset from which to get the columns. y: series or dataframe - Target column. + Target column(s). Returns ------- @@ -302,26 +301,6 @@ def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: return df[inc] -class PandasModin: - """Utility class to select the right data engine. - - Returns pandas or modin depending on the env variable - ATOM_DATA_ENGINE, which is set in BaseTransformer.py. - - """ - - def __getattr__(self, item: str) -> Any: - """Return the backend engine.""" - if os.environ.get("ATOM_DATA_ENGINE") == "modin": - return getattr(md, item) - else: - return getattr(pd, item) - - -# ATOM uses this instance to access the data engine -bk = PandasModin() - - class CatBMetric: """Custom evaluation metric for the CatBoost model. 
@@ -1009,10 +988,10 @@ def get_explanation( ) from None # Remember shap values in the _shap_values attribute - self._shap_values = bk.concat( + self._shap_values = pd.concat( [ self._shap_values, - bk.Series(list(self._explanation.values), index=calculate.index), + pd.Series(list(self._explanation.values), index=calculate.index), ] ) @@ -1347,9 +1326,9 @@ def merge(*args) -> DataFrame: """ if len(args_c := [x for x in args if x is not None and not x.empty]) == 1: - return bk.DataFrame(args_c[0]) + return pd.DataFrame(args_c[0]) else: - return bk.DataFrame(bk.concat(args_c, axis=1)) + return pd.DataFrame(pd.concat(args_c, axis=1)) def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_Pandas: @@ -1393,7 +1372,7 @@ def get_nan(dtype: Dtype) -> float | NAType: # Always convert these values default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf] - if isinstance(X, series_t): + if isinstance(X, pd.Series): return X.replace( to_replace=(missing_values or []) + default_values, value=get_nan(X.dtype), @@ -1405,30 +1384,76 @@ def get_nan(dtype: Dtype) -> float | NAType: ) -def get_cols(elem: Pandas) -> list[Series]: +def n_cols(obj: XSelector | YSelector) -> int: + """Get the number of columns in a dataset. + + Parameters + ---------- + obj: dict, sequence or dataframe-like + Dataset to check. + + Returns + ------- + int or None + Number of columns. + + """ + if hasattr(obj, "shape"): + return obj.shape[1] if len(obj.shape) > 1 else 1 + elif isinstance(obj, dict): + return len(obj) + elif (array := np.asarray(obj)).ndim > 1: + return array.shape[1] + else: + return array.ndim + + +def get_cols(obj: Pandas) -> list[pd.Series]: """Get a list of columns in dataframe / series. Parameters ---------- - elem: series or dataframe + obj: series or dataframe Element to get the columns from. Returns ------- list of series - Columns in elem. + Columns. """ - if isinstance(elem, series_t): - return [elem] + if isinstance(obj, pd.Series): + return [obj] else: - return [elem[col] for col in elem.columns] + return [obj[col] for col in obj.columns] + + +def get_col_names(obj: Tabular | None) -> list[str] | None: + """Get a list of column names in tabular objects. + + Parameters + ---------- + obj: series, dataframe or None + Element to get the column names from. + + Returns + ------- + list of str + Names of the columns. + + """ + if isinstance(obj, pd.DataFrame): + return list(obj.columns) + elif isinstance(obj, pd.Series): + return [obj.name] + else: + return None def variable_return( X: DataFrame | None, y: Series | None, -) -> DataFrame | Series | tuple[DataFrame, Pandas]: +) -> DataFrame | Series | tuple[DataFrame, Tabular]: """Return one or two arguments depending on which is None. This utility is used to make methods return only the provided @@ -1440,7 +1465,7 @@ def variable_return( Feature set. y: series, dataframe or None - Target column. + Target column(s). Returns ------- @@ -1479,7 +1504,7 @@ def get_segment(obj: list[T], segment: Segment) -> list[T]: return obj[slice(segment.start, segment.stop, segment.step)] -def is_sparse(obj: Pandas) -> bool: +def is_sparse(obj: Tabular) -> bool: """Check if the dataframe is sparse. A data set is considered sparse if any of its columns is sparse. @@ -1498,13 +1523,13 @@ def is_sparse(obj: Pandas) -> bool: return any(isinstance(col.dtype, pd.SparseDtype) for col in get_cols(obj)) -def check_empty(obj: Pandas) -> Pandas | None: +def check_empty(obj: Tabular) -> Tabular | None: """Check if a pandas object is empty. 
Parameters ---------- obj: series or dataframe - Pandas object to check. + Tabular object to check. Returns ------- @@ -1512,24 +1537,34 @@ def check_empty(obj: Pandas) -> Pandas | None: Same object or None if empty. """ - return obj if isinstance(obj, dataframe_t) and not obj.empty else None + return obj if isinstance(obj, pd.DataFrame) and not obj.empty else None + +def check_dependency(name: str) -> ModuleType: + """Check an optional dependency. -def check_dependency(name: str): - """Raise an error if a package is not installed. + Import the module or raise an error if the package is not + installed. Parameters ---------- name: str Name of the package to check. + Returns + ------- + module + Imported module. + """ - if not find_spec(name.replace("-", "_")): + try: + return import_module(name) + except ModuleNotFoundError as ex: raise ModuleNotFoundError( f"Unable to import the {name} package. Install it using " f"`pip install {name}` or install all of atom's optional " "dependencies with `pip install atom-ml[full]`." - ) + ) from ex def check_nltk_module(module: str, *, quiet: bool): @@ -1591,45 +1626,29 @@ def check_predict_proba(models: Model | Sequence[Model], method: str): ) -def check_scaling(X: Pandas, pipeline: Any | None = None) -> bool: +def check_scaling(X: Tabular) -> bool: """Check if the data is scaled. A data set is considered scaled when the mean of the mean of all columns lies between -0.05 and 0.05 and the mean of the standard deviation of all columns lies between 0.85 and 1.15. - Binary columns are excluded from the calculation. - - Additionally, if a pipeline is provided and there's a scaler in - the pipeline, it also returns False. + Categorical and binary columns are excluded from the calculation. Parameters ---------- X: series or dataframe Data set to check. - pipeline: Pipeline or None, default=None - Pipeline in which to check for a scaler (any estimator whose - name contains the word scaler). - Returns ------- bool Whether the data set is scaled. """ - has_scaler = False - if pipeline is not None: - has_scaler = any("scaler" in name.lower() for name in pipeline.named_steps) - - df = to_df(X) # Convert to dataframe - df = df.loc[:, (~df.isin([0, 1])).any(axis=0)] # Remove binary columns - - if df.empty: # All columns are binary -> no scaling needed - return True - else: - mean = df.mean(numeric_only=True).mean() - std = df.std(numeric_only=True).mean() - return has_scaler or bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15) + df = to_df(X) + mean = df.mean(numeric_only=True).mean() + std = df.std(numeric_only=True).mean() + return bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15) @contextmanager @@ -1760,61 +1779,11 @@ def time_to_str(t: Scalar) -> str: return f"{h:02.0f}h:{m:02.0f}m:{s:02.0f}s" -def n_cols(data: XSelector | YSelector) -> int: - """Get the number of columns in a dataset. - - Parameters - ---------- - data: sequence or dataframe-like - Dataset to check. - - Returns - ------- - int or None - Number of columns. - - """ - if (array := np.array(data, dtype="object")).ndim > 1: - return array.shape[1] - else: - return array.ndim # Can be zero when input is a dict - - -def to_pyarrow(column: Series, *, inverse: bool = False) -> Dtype: - """Get the pyarrow dtype corresponding to a series. - - Parameters - ---------- - column: series - Column to get the dtype from. If it already has a pyarrow - dtype, return the original dtype. - - inverse: bool, default=False - Whether to convert to pyarrow or back from pyarrow. 
- - Returns - ------- - str - Name of the converted dtype. - - """ - if not inverse and not column.dtype.name.endswith("[pyarrow]"): - if column.dtype.name == "object": - return "string[pyarrow]" # pyarrow doesn't support 'object' - else: - return f"{column.dtype.name}[pyarrow]" - elif inverse and column.dtype.name.endswith("[pyarrow]"): - return column.dtype.name[:-9] - - return column.dtype.name - - @overload def to_df( data: Literal[None], index: Axes | None = ..., columns: Axes | None = ..., - dtype: DtypeArg | None = ..., ) -> None: ... @@ -1823,23 +1792,21 @@ def to_df( data: XSelector, index: Axes | None = ..., columns: Axes | None = ..., - dtype: DtypeArg | None = ..., -) -> DataFrame: ... +) -> pd.DataFrame: ... def to_df( data: XSelector | None, index: Axes | None = None, columns: Axes | None = None, - dtype: DtypeArg | None = None, -) -> DataFrame | None: - """Convert a dataset to a dataframe. +) -> pd.DataFrame | None: + """Convert a dataset to a pandas dataframe. Parameters ---------- data: dataframe-like or None - Dataset to convert to a dataframe. If None or already a - dataframe, return unchanged. + Dataset to convert to a dataframe. If None or already a + pandas dataframe, return unchanged. index: sequence, index or None, default=None Values for the index. @@ -1847,53 +1814,46 @@ def to_df( columns: sequence or None, default=None Name of the columns. Use None for automatic naming. - dtype: str, dict, np.dtype or None, default=None - Data types for the output columns. If None, the types are - inferred from the data. - Returns ------- dataframe or None - Dataset as dataframe of a type given by the backend. + Dataset as dataframe. """ - if data is not None: - if not isinstance(data, bk.DataFrame): - # Assign default column names (dict already has column names) - if not isinstance(data, dict | Pandas) and columns is None: + if not isinstance(data, pd.DataFrame | None): + if hasattr(data, "__dataframe__"): + # Transform from dataframe interchange protocol + data_c = pd.api.interchange.from_dataframe(data.__dataframe__()) + else: + # Assign default column names (dict and series already have names) + if columns is None and not isinstance(data, dict | pd.Series): columns = [f"x{i}" for i in range(n_cols(data))] - if hasattr(data, "to_pandas") and bk.__name__ == "pandas": - # Convert cuML to pandas - data_c = data.to_pandas() # type: ignore[operator] - elif sps.issparse(data): - data_c = pd.DataFrame.sparse.from_spmatrix( - data=data, - index=index, - columns=columns, - ) + if sps.issparse(data): + data_c = pd.DataFrame.sparse.from_spmatrix(data, index, columns) else: - data_c = pd.DataFrame(data, index, columns) # type: ignore[arg-type, misc] - else: - data_c = data - - if dtype is not None: - data_c = data_c.astype(dtype) - - if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow": - data_c = data_c.astype({n: to_pyarrow(col) for n, col in data_c.items()}) + data_c = pd.DataFrame(data, index, columns) + else: + data_c = data - return data_c + if data_c is not None and columns is not None: + # Reorder columns to the provided order + try: + data_c = data_c[list(columns)] # Force order determined by columns + except KeyError: + raise ValueError( + f"The columns are different than seen at fit time. " + f"Features {set(data_c.columns) - set(columns)} are missing in X." + ) from None - return data + return data_c @overload def to_series( data: Literal[None], index: Axes | None = ..., - name: Hashable | None = ..., - dtype: Dtype | None = ..., + name: str | None = ..., ) -> None: ... 
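The reworked `to_df` above drops the engine-specific branches and instead accepts any object that implements the dataframe interchange protocol. A minimal sketch of that conversion step, using polars purely as an example of an `__dataframe__`-capable input (the example data is made up):

```python
import pandas as pd
import polars as pl

# Any dataframe-like object exposing __dataframe__ can be converted to a
# pandas DataFrame through the interchange protocol (pandas>=1.5).
X = pl.DataFrame({"x0": [1, 2, 3], "x1": [0.1, 0.2, 0.3]})
X_pd = pd.api.interchange.from_dataframe(X.__dataframe__())

assert isinstance(X_pd, pd.DataFrame)  # column names are preserved
```
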
@@ -1901,23 +1861,22 @@ def to_series( def to_series( data: dict[str, Any] | Sequence[Any], index: Axes | None = ..., - name: Hashable | None = ..., - dtype: Dtype | None = ..., -) -> Series: ... + name: str | None = ..., +) -> pd.Series: ... def to_series( data: dict[str, Any] | Sequence[Any] | None, index: Axes | None = None, - name: Hashable | None = None, - dtype: Dtype | None = None, -) -> Series | None: - """Convert a sequence to a series. + name: str | None = None, +) -> pd.Series | None: + """Convert a sequence to a pandas series. Parameters ---------- data: dict, sequence or None - Data to convert. If None, return unchanged. + Data to convert. If None or already a pandas series, return + unchanged. index: sequence, index or None, default=None Values for the index. @@ -1925,67 +1884,54 @@ def to_series( name: str or None, default=None Name of the series. - dtype: str, np.dtype or None, default=None - Data type for the output series. If None, the type is - inferred from the data. - Returns ------- series or None Sequence as series of a type given by the backend. """ - if data is not None: - if not isinstance(data, bk.Series): - if hasattr(data, "to_pandas") and bk.__name__ == "pandas": - data_c = data.to_pandas() # Convert cuML to pandas - else: - # Flatten for arrays with shape (n_samples, 1), sometimes returned by cuML - data_c = pd.Series( # type: ignore[misc] - data=np.array(data, dtype="object").ravel().tolist(), - index=index, - name=getattr(data, "name", name), - dtype=dtype, # type: ignore[arg-type] - ) - else: + if not isinstance(data, pd.Series | None): + if isinstance(data, md.Series): data_c = data + elif isinstance(data, pl.Series): + data_c = data.to_pandas(use_pyarrow_extension_array=True) + elif isinstance(data, pa.Array | pa.ChunkedArray): + data_c = data.to_pandas(types_mapper=pd.ArrowDtype) + else: + # Flatten for arrays with shape=(n_samples, 1) + data_c = pd.Series( + data=np.asarray(data).ravel().tolist(), + index=index, + name=name or "target", + ) + else: + data_c = data - if os.environ.get("ATOM_DATA_ENGINE") == "pyarrow": - data_c = data_c.astype(to_pyarrow(data_c)) - - return data_c - - return data + return data_c @overload -def to_pandas( +def to_tabular( data: Literal[None], index: Axes | None = ..., columns: Axes | None = ..., - name: str | None = ..., - dtype: DtypeArg | None = ..., ) -> None: ... @overload -def to_pandas( +def to_tabular( data: YConstructor, index: Axes | None = ..., columns: Axes | None = ..., - name: str | None = ..., - dtype: DtypeArg | None = ..., -) -> Pandas: ... +) -> Tabular: ... -def to_pandas( +def to_tabular( data: YConstructor | None, index: Axes | None = None, columns: Axes | None = None, - name: str | None = None, - dtype: DtypeArg | None = None, -) -> Pandas | None: - """Convert a sequence or dataset to a dataframe or series object. +) -> Tabular | None: + """Convert to a tabular pandas type. If the data is one-dimensional, convert to series, else to a dataframe. @@ -2001,23 +1947,19 @@ def to_pandas( columns: sequence or None, default=None Name of the columns. Use None for automatic naming. - name: str or None, default=None - Name of the series. - - dtype: str, dict, np.dtype or None, default=None - Data type for the output series. If None, the type is - inferred from the data. - Returns ------- series, dataframe or None - Data as a Pandas object. + Data as a Tabular object. 
""" - if n_cols(data) == 1: - return to_series(data, index=index, name=name, dtype=dtype) # type: ignore[misc, arg-type] + if (n_targets := n_cols(data)) == 1: + return to_series(data, index=index, name=flt(columns)) else: - return to_df(data, index=index, columns=columns, dtype=dtype) + if columns is None: + columns = [f"y{i}" for i in range(n_targets)] + + return to_df(data, index=index, columns=columns) def check_is_fitted( @@ -2053,26 +1995,6 @@ def check_is_fitted( Whether the estimator is fitted. """ - - def check_attr(attr: str) -> bool: - """Return whether an attribute is False or empty. - - Parameters - ---------- - attr: str - Name of the attribute to check. - - Returns - ------- - bool - Whether the attribute's value is False or empty. - - """ - if isinstance(value := getattr(obj, attr), pandas_t): - return value.empty - else: - return not value - if hasattr(obj, "_is_fitted"): is_fitted = obj._is_fitted else: @@ -2369,15 +2291,8 @@ def fit_one( Feature set with shape=(n_samples, n_features). If None, X is ignored. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + y: dict, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. message: str or None Short message. If None, nothing will be printed. @@ -2392,7 +2307,7 @@ def fit_one( """ Xt = to_df(X, index=getattr(y, "index", None)) - yt = to_pandas(y, index=getattr(Xt, "index", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None)) with _print_elapsed_time("Pipeline", message): if hasattr(estimator, "fit"): @@ -2439,7 +2354,7 @@ def transform_one( y: YConstructor | None = None, method: Literal["transform", "inverse_transform"] = "transform", **transform_params, -) -> tuple[DataFrame | None, Pandas | None]: +) -> tuple[pd.DataFrame | None, Pandas | None]: """Transform the data using one estimator. Parameters @@ -2451,15 +2366,8 @@ def transform_one( Feature set with shape=(n_samples, n_features). If None, X is ignored. - y: int, str, dict, sequence, dataframe or None, default=None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + y: dict, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. method: str, default="transform" Method to apply: transform or inverse_transform. @@ -2473,33 +2381,31 @@ def transform_one( Feature set. Returns None if not provided. series, dataframe or None - Target column. Returns None if not provided. + Target column(s). Returns None if not provided. """ - def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: - """Convert to df and set correct column names and order. - - If ATOM's data backend="pyarrow", convert the dtypes. + def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: + """Convert to df and set the correct column names. Parameters ---------- - out: np.ndarray, sps.matrix, series or dataframe + out: np.ndarray, sps.matrix or dataframe Data returned by the transformation. 
- og: dataframe + og: pd.DataFrame Original dataframe, prior to transformations. Returns ------- - dataframe + pd.DataFrame Transformed dataset. """ use_cols = [c for c in inc if c in og.columns] # Convert to pandas and assign proper column names - if not isinstance(out, dataframe_t): + if not isinstance(out, pd.DataFrame): if hasattr(transformer, "get_feature_names_out"): columns = transformer.get_feature_names_out() else: @@ -2520,11 +2426,10 @@ def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: index=getattr(y, "index", None), columns=getattr(transformer, "feature_names_in_", None), ) - yt = to_pandas( + yt = to_tabular( y, index=getattr(Xt, "index", None), columns=getattr(transformer, "target_names_in_", None), - name=flt(getattr(transformer, "target_names_in_", None)), ) use_y = True @@ -2557,27 +2462,25 @@ def prepare_df(out: TReturn, og: DataFrame) -> DataFrame: # Transform can return X, y or both if isinstance(out, tuple): X_new = prepare_df(out[0], Xt) - y_new = to_pandas( + y_new = to_tabular( data=out[1], index=Xt.index, - name=getattr(yt, "name", None), - columns=getattr(yt, "columns", None), + columns=get_col_names(yt), ) - if isinstance(yt, dataframe_t): + if isinstance(yt, pd.DataFrame): y_new = prepare_df(y_new, yt) elif "X" in params and X is not None and any(c in Xt for c in inc): # X in -> X out X_new = prepare_df(out, Xt) y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0) elif y is not None: - y_new = to_pandas( + y_new = to_tabular( data=out, index=yt.index, - name=getattr(yt, "name", None), - columns=getattr(yt, "columns", None), + columns=get_col_names(yt), ) X_new = Xt if Xt is None else Xt.set_index(y_new.index) - if isinstance(yt, dataframe_t): + if isinstance(yt, pd.DataFrame): y_new = prepare_df(y_new, yt) return X_new, y_new @@ -2589,7 +2492,7 @@ def fit_transform_one( y: YConstructor | None, message: str | None = None, **fit_params, -) -> tuple[DataFrame | None, Series | None, Transformer]: +) -> tuple[pd.DataFrame | None, Pandas | None, Transformer]: """Fit and transform the data using one estimator. Estimators without a `transform` method aren't transformed. @@ -2603,15 +2506,8 @@ def fit_transform_one( Feature set with shape=(n_samples, n_features). If None, X is ignored. - y: int, str, dict, sequence, dataframe or None - Target column corresponding to `X`. - - - If None: y is ignored. - - If int: Position of the target column in X. - - If str: Name of the target column in X. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + y: dict, sequence, dataframe-like or None + Target column(s) corresponding to `X`. message: str or None, default=None Short message. If None, nothing will be printed. @@ -2621,11 +2517,11 @@ def fit_transform_one( Returns ------- - dataframe or None + pd.DataFrame or None Feature set. Returns None if not provided. - series or None - Target column. Returns None if not provided. + pd.Series, pd.DataFrame or None + Target column(s). Returns None if not provided. Transformer Fitted transformer. 
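For the target column, the new `to_series`/`to_tabular` helpers above normalize engine-specific containers back to pandas before fitting. A rough illustration of the conversions they rely on, assuming polars and pyarrow are installed (both are added to the dependencies in this patch):

```python
import pandas as pd
import polars as pl
import pyarrow as pa

# polars -> pandas, keeping pyarrow-backed extension dtypes
s1 = pl.Series("target", [1, 0, 1]).to_pandas(use_pyarrow_extension_array=True)

# pyarrow -> pandas with ArrowDtype
s2 = pa.array([1, 0, 1]).to_pandas(types_mapper=pd.ArrowDtype)

# plain sequences are flattened into a named pandas Series
s3 = pd.Series([1, 0, 1], name="target")
```
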
@@ -2788,28 +2684,37 @@ def wrapper( X: XSelector | None = None, y: YSelector | None = None, **kwargs, - ) -> T_Transformer | Pandas | tuple[DataFrame, Pandas]: + ) -> T_Transformer | Tabular | tuple[DataFrame, Tabular]: if f.__name__ == "fit": - Xt, yt = self._check_input(X, y) + Xt = to_df(X, index=getattr(y, "index", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) + return f(self, Xt, yt, **kwargs) else: if "TransformerMixin" not in str(self.fit): check_is_fitted(self) - Xt, yt = self._check_input( - X=X, - y=y, + Xt = to_df( + data=X, + index=getattr(y, "index", None), columns=getattr(self, "feature_names_in_", None), - name=getattr(self, "target_names_in_", None), + ) + yt = to_tabular( + y, + index=getattr(Xt, "index", None), + columns=getattr(self, "target_names_in_", None), ) if "y" in sign(f): - return f(self, Xt, yt, **kwargs) + Xt, yt = f(self, Xt, yt, **kwargs) + return self._convert(Xt), self._convert(yt) else: - return f(self, Xt, **kwargs) + Xt = f(self, Xt, **kwargs) + return self._convert(Xt) return wrapper diff --git a/docs_sources/changelog/v6.x.x.md b/docs_sources/changelog/v6.x.x.md index c967f1aaa..c681b188d 100644 --- a/docs_sources/changelog/v6.x.x.md +++ b/docs_sources/changelog/v6.x.x.md @@ -10,6 +10,7 @@ * Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/) and [Python 3.9](ttps://www.python.org/downloads/release/python-390/). * New data engines. Read more in the [user guide][data-acceleration]. +* Added the `dask` [parallelization backend][parallel-execution]. * Improved memory optimizations. Read more in the [user guide][memory-considerations]. * Added the `iterative` strategy for [numerical imputation][imputer]. * Added the `hdbscan` strategy to the [Pruner][] class. diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index fe360d19a..291e317d5 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -41,8 +41,10 @@ packages are necessary for its correct functioning. 
* **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) * **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2) -* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1) * **[plotly](https://plotly.com/python/)** (>=5.18.0) +* **[polars](https://pola.rs/)** (>=0.20.7) +* **[pyarrow](https://arrow.apache.org/docs/python/)** (>=15.0.0) +* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1) * **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0) * **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.4.0) * **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1) @@ -85,6 +87,7 @@ running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/l * **[mypy](https://www.mypy-lang.org/)** (>=1.6.1) * **[pandas_stubs](https://pypi.org/project/pandas-stubs/)** (>=2.1.1.230928) * **[pre-commit](https://pre-commit.com/)** (>=3.5.0) +* **[pyspark-stubs](https://github.com/zero323/pyspark-stubs)** (>=3.0.0) * **[ruff](https://docs.astral.sh/ruff/)** (>=0.1.7) * **[types-requests](https://github.com/python/typeshed)** (>=2.31.0.10) diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py index efddad058..84e3665e8 100644 --- a/docs_sources/scripts/autodocs.py +++ b/docs_sources/scripts/autodocs.py @@ -626,7 +626,7 @@ def get_table(self, blocks: list) -> str: elif obj.__class__.__name__ == "cached_property": obj = obj.func - # Get the return type. Sometimes it returns a string 'Pandas' + # Get the return type. Sometimes it returns a string 'Tabular' # and sometimes a class pandas.DataFrame. Unclear why output = str(signature(obj).return_annotation) @@ -926,7 +926,7 @@ def types_conversion(dtype: str) -> str: "Pipeline": "[Pipeline][]", "collections.abc.Hashable": "str", "Scalar": "int | float", - "Pandas": "Series | DataFrame", + "Tabular": "Series | DataFrame", "int | numpy.integer": "int", "float | numpy.floating": "float", "Series | modin.pandas.series.Series": "Series", diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md index 371252f2d..1200a2d3f 100644 --- a/docs_sources/user_guide/accelerating.md +++ b/docs_sources/user_guide/accelerating.md @@ -55,32 +55,48 @@ regardless of the engine parameter. ## Data acceleration +ATOM is mostly built around [sklearn](https://scikit-learn.org/stable/) (and [sktime](https://www.sktime.net/en/stable/) for [time series][] +tasks), and both these libraries use numpy as their computation backend. Since +`atom` relies heavily on column names, it uses pandas (which in turn uses numpy) +as its data backend. However, for the convenience of the user, it implements +several data engines, that wraps the data in a different type when called by the +user. This is very similar to sklearn's [set_output](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html) behaviour, but ATOM +extends this to many more data types. For example, selecting the `polars` data +engine, makes `atom.dataset` return a polars dataframe and `atom.winner.predict(X)` +return a polars series. + The data engine can be specified through the [`engine`][atomclassifier-engine] -parameter, e.g. `#!python engine="pyarrow"` or -`#!python engine={"data": "pyarrow", "estimator": "sklearnex"}` to combine it -with an [estimator engine][estimator acceleration]. ATOM integrates the following -data engines: - -- **pandas**: This is the default data engine. 
It uses the [`pandas`](https://pandas.pydata.org/docs/index.html)
-  library with [`numpy`](https://numpy.org/) as backend.
-- **pyarrow**: This engine also uses [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html), but with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html)
-  backend, instead of `numpy`. PyArrow is a cross-language, platform-independent,
-  in-memory data format, that provides an efficient and fast way to serialize and
-  deserialize data.
+parameter, e.g. `#!python engine="pyarrow"` or `#!python engine={"data": "pyarrow",
+"estimator": "sklearnex"}` to combine it with an [estimator engine][estimator acceleration].
+ATOM integrates the following data engines:
+
+- **numpy**: Transform the data to a [`numpy`](https://numpy.org/) array.
+- **pandas**: Transform the data to [`pandas`](https://pandas.pydata.org/docs/index.html) with `numpy` backend. This
+  is the default engine and, in almost all cases, leaves the data unchanged.
+- **pandas-pyarrow**: Transform the data to [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html) with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html)
+  backend. Read more in pandas' [user guide](https://pandas.pydata.org/docs/user_guide/pyarrow.html).
+- **polars**: The [polars](https://docs.pola.rs/) library is a blazingly fast dataframe library
+  implemented in Rust and based on Apache Arrow. Transforms the data to a polars
+  dataframe or series.
+- **polars-lazy**: This engine is similar to the `polars` engine, but it returns
+  a [pl.LazyFrame](https://docs.pola.rs/py-polars/html/reference/lazyframe/index.html) instead of a [pl.DataFrame](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html).
+- **pyarrow**: PyArrow is a cross-language, platform-independent, in-memory data
+  format that provides an efficient and fast way to serialize and deserialize data.
+  The data is transformed to a [pa.Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) or [pa.Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html).
+- **modin**: The [modin](https://modin.readthedocs.io/en/stable/) library is a multi-threading, drop-in replacement
+  for pandas that uses [Ray](https://www.ray.io/) as backend. Transform the data to a modin dataframe
+  or series.
+- **dask**: The [dask](https://docs.dask.org/en/stable/) library is a powerful Python library for parallel and
+  distributed computing. Transform the data to a [dask dataframe](https://docs.dask.org/en/latest/dataframe.html) or [dask series](https://docs.dask.org/en/stable/generated/dask.dataframe.Series.html).
+- **pyspark**: The [pyspark](https://spark.apache.org/docs/latest/api/python/index.html) library is the Python API for Apache Spark.
+  Transform the data to a [pyspark dataframe](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html) or [pyspark series](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.Column.html).
+- **pyspark-pandas**: Similar to the `pyspark` engine, but it returns pyspark objects
+  with the [pandas API](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html).
 
 !!! note
-    Although atom accepts a numpy array or a list of lists as input, it
-    converts the data internally to the specified data engine since its API
-    requires column names and indices.
-
-!!! 
warning - Depending on the data engine, the following limitations apply: - - - The `pyarrow` engine doesn't support [sparse datasets][]. - - The [LightGBM][] and [XGBoost][] models don't support the `pyarrow` engine. - - The `modin` engine is not compatible with [forecast][time-series] tasks. + It's important to realize that, within atom, the data is still processed using + pandas (with the numpy backend). Only when the data is returned to the user, it + is transformed to the selected format. ## Estimator acceleration @@ -238,16 +254,18 @@ parallelization backends. mostly useful when the execution bottleneck is a compiled extension that explicitly releases the GIL (for instance a Cython loop wrapped in a "with nogil" block or an expensive call to a library such as numpy). -* **ray:** [Ray](https://www.ray.io/) is an open-source unified compute framework - that makes it easy to scale AI and Python workloads. Read more about Ray [here](https://docs.ray.io/en/latest/ray-core/walkthrough.html). - See [here][example-ray-backend] an example use case. +* **ray:** [Ray](https://www.ray.io/) is an open-source unified compute framework that makes it + easy to scale AI and Python workloads. Read more about Ray [here](https://docs.ray.io/en/latest/ray-core/walkthrough.html). See + [here][example-ray-backend] an example use case. +* **dask:** [Dask](https://docs.dask.org/en/stable/) is a flexible parallel computing library for analytics. + Read more about Dask [here](https://docs.dask.org/en/stable/10-minutes-to-dask.html). The parallelization backend is applied in the following cases: * In every individual estimator that uses parallelization internally. * To calculate cross-validated results during [hyperparameter tuning][]. -* To train multiple models in parallel (when the trainer's `parallel` parameter is True). +* To train multiple models in parallel (when [`parallel=True`][directclassifier-parallel]). * To calculate partial dependencies in [plot_partial_dependence][]. !!! note diff --git a/docs_sources/user_guide/data_management.md b/docs_sources/user_guide/data_management.md index 0e000bdb2..f95fb2bbb 100644 --- a/docs_sources/user_guide/data_management.md +++ b/docs_sources/user_guide/data_management.md @@ -38,8 +38,8 @@ or together: * X * X, y -Remember to use the `y` parameter to indicate the target column in X when -using the first option. If not specified, the last column in X is used as +Remember to use the `y` parameter to indicate the target column in `X` when +using the first option. If not specified, the last column in `X` is used as the target. In both these cases, the sizes of the sets are defined using the `test_size` and `holdout_size` parameters. Note that the splits are made after the subsample of the dataset with the `n_rows` parameter (when not diff --git a/docs_sources/user_guide/nomenclature.md b/docs_sources/user_guide/nomenclature.md index ef758c094..33bb42c2c 100644 --- a/docs_sources/user_guide/nomenclature.md +++ b/docs_sources/user_guide/nomenclature.md @@ -35,22 +35,22 @@ the target column.
dataframe
-Two-dimensional, size-mutable, potentially heterogeneous tabular data of type -[pd.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) -or its [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/dataframe.html) -counterpart. +Two-dimensional, size-mutable, potentially heterogeneous tabular data. +The type is usually [pd.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), +but could potentially be any of the dataframe types backed by the +selected [data engine][data-acceleration].
dataframe-like
-Any type object from which a [dataframe][] can be created. This includes an -[iterable](https://docs.python.org/3/glossary.html#term-iterable), a -[dict](https://docs.python.org/3/library/functions.html#func-dict) whose +Any type object from which a [pd.DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) +can be created. This includes an [iterable](https://docs.python.org/3/glossary.html#term-iterable), +a [dict](https://docs.python.org/3/library/functions.html#func-dict) whose values are 1d-arrays, a two-dimensional [list](https://docs.python.org/3/library/functions.html#func-list), [tuple](https://docs.python.org/3/library/functions.html#func-tuple), [np.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) or [sps.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html), -and most commonly, a [dataframe][]. This is the standard input format for -any dataset. +or any object that follows the [dataframe interchange protocol](https://data-apis.org/dataframe-protocol/latest/index.html). +This is the standard input format for any dataset. Additionally, you can provide a callable whose output is any of the aforementioned types. This is useful when the dataset is very large and @@ -67,13 +67,6 @@ method. Often used interchangeably with [predictor][] because of user preference.
-
index
-
-Immutable sequence used for indexing and alignment of type [pd.Index](https://pandas.pydata.org/docs/reference/api/pandas.Index.html) -or their [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/dataframe.html) -counterpart. -
-
missing values
All values in the [`missing`][atomclassifier-missing] attribute, as @@ -129,10 +122,10 @@ column.
series
-One-dimensional ndarray with axis labels of type -[pd.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html#pandas.Series) -or its [modin](https://modin.readthedocs.io/en/stable/flow/modin/pandas/series.html) -counterpart. +One-dimensional ndarray with axis labels. The type is usually +[pd.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html#pandas.Series), +but could potentially be any of the series types backed by the +selected [data engine][data-acceleration].
target
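The dataframe and series entries above are now engine-dependent. As an illustrative sketch of what that means in practice (the dataset and engine value are example choices, mirroring the new tests and the data acceleration guide):

```python
from atom import ATOMClassifier
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# With the default engine, data attributes are pandas objects; with
# e.g. engine="polars" they are returned as polars objects instead.
atom = ATOMClassifier(X, y, engine="polars", random_state=1)
atom.dataset  # polars DataFrame
atom.y        # polars Series
```
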
diff --git a/pyproject.toml b/pyproject.toml index aa32035f3..5c3297c80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,8 @@ dependencies = [ "optuna>=3.4.0", "pandas[parquet]>=2.1.2", "plotly>=5.18.0", + "polars>=0.20.7", + "pyarrow>=15.0.0", "ray[serve]>=2.9.1", "requests>=2.31.0", "scikit-learn>=1.4.0", @@ -69,6 +71,7 @@ dev = [ "mypy>=1.6.1", "pandas_stubs>=2.1.1.230928", "pre-commit>=3.5.0", + "pyspark-stubs>=3.0.0", "ruff>=0.1.7", "types-requests>=2.31.0.10", # Testing diff --git a/tests/conftest.py b/tests/conftest.py index 9721308ff..8dd9b3e5b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ from sktime.split import temporal_train_test_split from atom.data_cleaning import TransformerMixin -from atom.utils.utils import merge, n_cols, to_df, to_pandas +from atom.utils.utils import merge, n_cols, to_df, to_tabular if TYPE_CHECKING: @@ -31,7 +31,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import DataFrame, Pandas, Sequence, XSelector + from atom.utils.types import DataFrame, Sequence, Tabular, XSelector class DummyTransformer(TransformerMixin, BaseEstimator): @@ -116,7 +116,7 @@ def random(): def get_train_test( X: XSelector | None, y: Sequence[Any] | DataFrame, -) -> Pandas | tuple[Pandas, Pandas]: +) -> Tabular | tuple[Tabular, Tabular]: """Get train and test sets from X and y. Parameters @@ -125,7 +125,7 @@ def get_train_test( Feature set. If None, split as time series data set. y: sequence or DataFrame - Target column corresponding to `X`. + Target column(s) corresponding to `X`. Returns ------- @@ -138,7 +138,7 @@ def get_train_test( """ if X is not None: return train_test_split( - merge(to_df(X), to_pandas(y, columns=[f"y{i}" for i in range(n_cols(y))])), + merge(to_df(X), to_tabular(y, columns=[f"y{i}" for i in range(n_cols(y))])), test_size=0.3, random_state=1, ) diff --git a/tests/test_basetrainer.py b/tests/test_basetrainer.py index e782fdd4b..d0e702be6 100644 --- a/tests/test_basetrainer.py +++ b/tests/test_basetrainer.py @@ -9,7 +9,6 @@ import mlflow import pytest -import ray from mlflow.tracking.fluent import ActiveRun from optuna.distributions import CategoricalDistribution, IntDistribution from optuna.pruners import MedianPruner @@ -383,14 +382,25 @@ def test_parallel_with_ray(): trainer = DirectClassifier( models=["LR", "LDA"], parallel=True, - n_jobs=1, + n_jobs=2, backend="ray", random_state=1, ) - # Fails because Mock returns empty list + # Fails because MagicMock returns empty list with pytest.raises(RuntimeError, match=".*All models failed.*"): trainer.run(bin_train, bin_test) - ray.shutdown() + + +def test_parallel_with_dask(): + """Assert that parallel runs successfully with dask backend.""" + trainer = DirectClassifier( + models=["LR", "LDA"], + parallel=True, + n_jobs=2, + backend="dask", + random_state=1, + ) + trainer.run(bin_train, bin_test) @patch("atom.basetrainer.Parallel", MagicMock()) diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index 3cffdfcd0..b6915b250 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -57,23 +57,11 @@ def test_device_parameter(): assert os.environ["CUDA_VISIBLE_DEVICES"] == "0" -@patch("ray.init") -def test_engine_parameter_modin(ray): - """Assert that ray is initialized when modin is data backend.""" - base = BaseTransformer(device="cpu", engine="modin") - assert base.engine.data == "modin" - assert ray.is_called_once - - -def test_engine_parameter_env_var(): - """Assert that the environment variable is 
set.""" - base = BaseTransformer(device="cpu", engine="pyarrow") - assert base.engine == EngineTuple(data="pyarrow", estimator="sklearn") - assert os.environ["ATOM_DATA_ENGINE"] == base.engine.data - - base = BaseTransformer(device="cpu", engine="sklearnex") - assert base.engine == EngineTuple(data="pandas", estimator="sklearnex") - assert os.environ["ATOM_DATA_ENGINE"] == base.engine.data +@pytest.mark.parametrize("engine", [None, "pandas", "sklearn", {}, EngineTuple()]) +def test_engine_parameter(engine): + """Assert that the engine parameter can be initialized.""" + base = BaseTransformer(engine=engine) + assert base.engine == EngineTuple() @patch.dict("sys.modules", {"sklearnex": None}) @@ -103,6 +91,13 @@ def test_backend_parameter_ray(ray): assert ray.is_called_once +@patch("dask.distributed.Client") +def test_backend_parameter_dask(dask): + """Assert that dask is initialized when selected.""" + BaseTransformer(backend="dask") + assert dask.is_called_once + + def test_backend_parameter(): """Assert that other backends can be specified.""" base = BaseTransformer(backend="threading") diff --git a/tests/test_branch.py b/tests/test_branch.py index 5484168d4..85a65af64 100644 --- a/tests/test_branch.py +++ b/tests/test_branch.py @@ -8,9 +8,11 @@ import os from pathlib import Path +import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal +from sklearn.preprocessing import MinMaxScaler, StandardScaler from atom import ATOMClassifier, ATOMRegressor from atom.branch import Branch, BranchManager @@ -276,14 +278,14 @@ def test_data_properties_to_df(): """Assert that the data attributes are converted to a df at setter.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) atom.X = X_bin_array - assert isinstance(atom.X, pd.DataFrame) + assert isinstance(atom.branch.X, pd.DataFrame) def test_data_properties_to_series(): """Assert that the data attributes are converted to a series at setter.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) atom.y = y_bin_array - assert isinstance(atom.y, pd.Series) + assert isinstance(atom.branch.y, pd.Series) def test_setter_error_unequal_rows(): @@ -561,6 +563,30 @@ def test_load_no_dir(): atom.branch = "main" +def test_check_scaling_scaler_in_pipeline(): + """Assert that check_scaling returns True when there's a scaler in the pipeline.""" + atom = ATOMClassifier(X_bin, y=y_bin, random_state=1) + assert not atom.branch.check_scaling() + atom.add(MinMaxScaler()) + assert atom.branch.check_scaling() + + +def test_check_scaling(): + """Assert that the check_scaling method returns whether the data is scaled.""" + scaler = StandardScaler() + scaler.__class__.__name__ = "OtherName" + + atom = ATOMClassifier(X_bin, y=y_bin, random_state=1) + atom.add(scaler) + assert atom.branch.check_scaling() + + +def test_check_scaling_drop_binary(): + """Assert that binary rows are dropped to check scaling.""" + atom = ATOMClassifier(np.tile(y10, (10, 1)), y=y10, random_state=1) + assert atom.branch.check_scaling() + + # Test BranchManager =============================================== >> def test_branchmanager_repr(): @@ -665,3 +691,27 @@ def test_reset(): assert len(atom._branches) == 1 assert not glob.glob("joblib/atom/Branch(main).pkl") assert atom.og is atom.branch + + +# Test data engines ================================================ >> + +def test_numpy_engine(): + """Assert that the numpy engine returns a numpy array.""" + atom = ATOMClassifier(X_bin, y_bin, engine="numpy", random_state=1) + assert 
isinstance(atom.dataset, np.ndarray) + + +def test_pandas_numpy_engine(): + """Assert that the pandas numpy engine returns a pandas dataframe.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pandas", random_state=1) + assert all(isinstance(dtype, np.dtype) for dtype in atom.dataset.dtypes) + assert isinstance(atom.y.dtype, np.dtype) + + +def test_pandas_pyarrow_engine(): + """Assert that the pandas pyarrow engine returns pyarrow dtypes.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pandas-pyarrow", random_state=1) + assert all(isinstance(dtype, pd.ArrowDtype) for dtype in atom.dataset.dtypes) + assert isinstance(atom.y.dtype, pd.ArrowDtype) + + From 8885b7a28333b6650561d15f8d227e8a391a5768 Mon Sep 17 00:00:00 2001 From: Mavs Date: Tue, 13 Feb 2024 21:18:32 +0100 Subject: [PATCH 02/12] set_output --- atom/api.py | 14 ++ atom/atom.py | 44 +++--- atom/basemodel.py | 4 +- atom/baserunner.py | 23 +++- atom/branch/branch.py | 44 ++++-- atom/branch/branchmanager.py | 7 +- atom/branch/dataengines.py | 76 ++++------- atom/data_cleaning.py | 40 +++++- atom/models/classreg.py | 1 - atom/pipeline.py | 41 +++++- atom/utils/utils.py | 77 ++++++----- docs_sources/dependencies.md | 2 +- docs_sources/scripts/autodocs.py | 1 + docs_sources/user_guide/accelerating.md | 2 +- pyproject.toml | 2 +- tests/test_baserunner.py | 174 +++++++++++++++++++++++- tests/test_basetransformer.py | 168 ----------------------- tests/test_branch.py | 64 ++++++++- 18 files changed, 479 insertions(+), 305 deletions(-) diff --git a/atom/api.py b/atom/api.py index 2fc4acdeb..e7a55e98e 100644 --- a/atom/api.py +++ b/atom/api.py @@ -259,9 +259,16 @@ class ATOMClassifier(ATOM): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -531,9 +538,16 @@ class ATOMForecaster(ATOM): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": diff --git a/atom/atom.py b/atom/atom.py index a022f6bcd..573be0ba2 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -24,12 +24,9 @@ from beartype import beartype from joblib.memory import Memory from pandas._typing import DtypeObj -from scipy import stats from sklearn.pipeline import Pipeline as SkPipeline from sklearn.utils.metaestimators import available_if -from statsmodels.stats.diagnostic import acorr_ljungbox -from statsmodels.tsa.stattools import adfuller, kpss - +from polars.dependencies import _lazy_import from atom.baserunner import BaseRunner from atom.basetransformer import BaseTransformer from atom.branch import Branch, BranchManager @@ -68,6 +65,11 @@ ) +stats, _ = _lazy_import("scipy.stats") +diagnostic, _ = _lazy_import("statsmodels.stats.diagnostic") +stattools, _ = _lazy_import("statsmodels.tsa.stattools") + + T_Transformer = TypeVar("T_Transformer", bound=Transformer) @@ -497,11 +499,11 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame: for test in ("adf", "kpss", "lb"): if test == "adf": - stat = adfuller(X, maxlag=None, autolag="AIC") + stat = stattools.adfuller(X, maxlag=None, autolag="AIC") elif test == "kpss": - stat = kpss(X, regression="ct", nlags="auto") # ct is trend stationarity + stat = stattools.kpss(X, regression="ct", nlags="auto") # ct is trend stationarity elif test == "lb": - l_jung = acorr_ljungbox(X, lags=None, period=lst(self.sp.sp)[0]) + l_jung = diagnostic.acorr_ljungbox(X, lags=None, 
period=lst(self.sp.sp)[0]) stat = l_jung.loc[l_jung["lb_pvalue"].idxmin()] # Add as column to the dataframe @@ -1026,7 +1028,7 @@ def stats(self, _vb: Int = -2, /): self._log(f"Seasonal period: {self.sp.sp}", _vb) for ds in ("train", "test", "holdout"): - if (data := getattr(self, ds)) is not None: + if (data := getattr(self.branch, ds)) is not None: self._log(f"{ds.capitalize()} set size: {len(data)}", _vb) if self.task.is_forecast: self._log(f" --> From: {min(data.index)} To: {max(data.index)}", _vb) @@ -1267,7 +1269,7 @@ def _add_transformer( kwargs = { "estimator": transformer_c, "X": self.branch.X_train, - "y": self.y_train, + "y": self.branch.y_train, **fit_params, } @@ -1302,19 +1304,23 @@ def _add_transformer( data = merge(self.branch.X if X is None else X, self.branch.y if y is None else y) # y can change the number of columns or remove rows -> reassign index - self.branch._container = DataContainer( - data=data, - train_idx=self.branch._data.train_idx.intersection(data.index), - test_idx=self.branch._data.test_idx.intersection(data.index), - n_targets=self.branch._data.n_targets if y is None else n_cols(y), + self._branches.fill( + DataContainer( + data=data, + train_idx=self.branch._data.train_idx.intersection(data.index), + test_idx=self.branch._data.test_idx.intersection(data.index), + n_targets=self.branch._data.n_targets if y is None else n_cols(y), + ) ) if self._config.index is False: - self.branch._container = DataContainer( - data=(data := self.branch.dataset.reset_index(drop=True)), - train_idx=data.index[: len(self.branch._data.train_idx)], - test_idx=data.index[-len(self.branch._data.test_idx):], - n_targets=self.branch._data.n_targets, + self._branches.fill( + DataContainer( + data=(data := self.branch.dataset.reset_index(drop=True)), + train_idx=data.index[: len(self.branch._data.train_idx)], + test_idx=data.index[-len(self.branch._data.test_idx):], + n_targets=self.branch._data.n_targets, + ) ) if self.branch._holdout is not None: self.branch._holdout.index = range( diff --git a/atom/basemodel.py b/atom/basemodel.py index 7a418e847..8d0f83785 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -279,7 +279,7 @@ def __dir__(self) -> list[str]: if "_branch" in self.__dict__: # Add additional attrs from the branch - attrs += Branch._get_data_attrs() + attrs += self.branch._get_shared_attrs() # Add additional attrs from the dataset attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)] @@ -292,7 +292,7 @@ def __dir__(self) -> list[str]: def __getattr__(self, item: str) -> Any: """Get attributes from branch or data.""" if "_branch" in self.__dict__: - if item in Branch._get_data_attrs(): + if item in self.branch._get_shared_attrs(): return getattr(self.branch, item) # Get attr from branch elif item in self.branch.columns: return self.branch.dataset[item] # Get column diff --git a/atom/baserunner.py b/atom/baserunner.py index ff6d51a63..d05a76e88 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -84,7 +84,7 @@ def __dir__(self) -> list[str]: attrs = [x for x in super().__dir__() if hasattr(self, x)] # Add additional attrs from the branch - attrs += Branch._get_data_attrs() + attrs += self.branch._get_shared_attrs() # Add additional attrs from the dataset attrs += [x for x in DF_ATTRS if hasattr(self.dataset, x)] @@ -105,7 +105,7 @@ def __getattr__(self, item: str) -> Any: """Get branch, attr from branch, model, column or attr from dataset.""" if item in self.__dict__["_branches"]: return self._branches[item] # Get branch - elif item in 
Branch._get_data_attrs(): + elif item in self.branch._get_shared_attrs(): if isinstance(attr := getattr(self.branch, item), pandas_t): return self._convert(attr) # Transform data through data engine else: @@ -224,7 +224,7 @@ def holdout(self) -> DataFrame | None: the [user guide][data-sets]. """ - return self.branch._holdout + return self._convert(self.branch._holdout) @property def models(self) -> str | list[str] | None: @@ -347,6 +347,9 @@ def _check_input(X: XSelector, y: YSelector) -> tuple[DataFrame, Tabular]: ... def _check_input( X: XSelector | None = None, y: YSelector | None = None, + *, + columns: list[str] | None = None, + name: str | list[str] | None = None, ) -> tuple[DataFrame | None, Tabular | None]: """Prepare the input data. @@ -371,6 +374,14 @@ def _check_input( tasks. - If dataframe: Target columns for multioutput tasks. + columns: list, default=None + Column names for the feature set. If None, default names + are used. + + name: str, default=None + Name of the target column(s). If None, a default name is + used. + Returns ------- dataframe or None @@ -386,7 +397,7 @@ def _check_input( if X is None and y is None: raise ValueError("X and y can't be both None!") elif X is not None: - Xt = to_df(deepcopy(X() if callable(X) else X)) + Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) # If text dataset, change the name of the column to corpus if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": @@ -402,7 +413,7 @@ def _check_input( # Prepare target column if isinstance(y, (dict, *sequence_t, DataFrame)): if isinstance(y, dict): - yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None)) + yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None), columns=name) else: # If X and y have different number of rows, try multioutput if Xt is not None and len(Xt) != len(y): @@ -431,7 +442,7 @@ def _check_input( else: yt = y - yt = to_tabular(deepcopy(yt), index=getattr(Xt, "index", None)) + yt = to_tabular(deepcopy(yt), index=getattr(Xt, "index", None), columns=name) # Check X and y have the same indices if Xt is not None and not Xt.index.equals(yt.index): diff --git a/atom/branch/branch.py b/atom/branch/branch.py index f179fd55b..70fd628c8 100644 --- a/atom/branch/branch.py +++ b/atom/branch/branch.py @@ -13,8 +13,8 @@ from pathlib import Path from typing import Literal, overload from warnings import filterwarnings +from polars.dependencies import _lazy_import -import dill as pickle import pandas as pd from beartype import beartype from beartype.roar import BeartypeDecorHintPep585DeprecationWarning @@ -33,6 +33,9 @@ ) +pickle = _lazy_import("dill") + + filterwarnings("ignore", category=BeartypeDecorHintPep585DeprecationWarning) @@ -100,6 +103,26 @@ class Branch: """ + _shared_attrs = [ + "pipeline", + "mapping", + "dataset", + "train", + "test", + "X", + "y", + "X_train", + "y_train", + "X_test", + "y_test", + "shape", + "columns", + "n_columns", + "features", + "n_features", + "target", + ] + def __init__( self, name: str, @@ -229,10 +252,13 @@ def counter(name: str, dim: str) -> str | None: if under_name := counter(name, "under"): under = getattr(self, under_name) + if (columns := get_col_names(value)) is None: + columns = get_col_names(under) if under_name else None + obj = to_tabular( data=value, index=side.index if side_name else None, - columns=get_col_names(under) if under_name else None, + columns=columns, ) if side_name: # Check for equal rows @@ -439,21 +465,17 @@ def _all(self) -> pd.DataFrame: # Utility methods 
============================================== >> - @classmethod - def _get_data_attrs(cls) -> list[str]: - """Get the data attributes of the class. + def _get_shared_attrs(self) -> list[str]: + """Get the attributes that can be accessed from a runner. Returns ------- list of str - Data properties. + Instance attributes. """ - return [ - x - for x in dir(cls) - if isinstance(getattr(cls, x), property) and not x.startswith("_") - ] + instance_vars = [x for x in vars(self) if not x.startswith("_") and x.endswith("_")] + return self._shared_attrs + instance_vars @overload def _get_rows( diff --git a/atom/branch/branchmanager.py b/atom/branch/branchmanager.py index 7a0cd96d6..6e2c41fe8 100644 --- a/atom/branch/branchmanager.py +++ b/atom/branch/branchmanager.py @@ -216,6 +216,8 @@ def add(self, name: str, parent: Branch | None = None): def fill(self, data: DataContainer, holdout: pd.DataFrame | None = None): """Fill the current branch with data. + This call resets the cached holdout calculation. + Parameters ---------- data: DataContainer @@ -226,7 +228,10 @@ def fill(self, data: DataContainer, holdout: pd.DataFrame | None = None): """ self.current._container = data - self.current._holdout = holdout + if holdout is not None: + self.current._holdout = holdout + + self.current.__dict__.pop("holdout", None) def reset(self, *, hard: Bool = False): """Reset this instance to its initial state. diff --git a/atom/branch/dataengines.py b/atom/branch/dataengines.py index 95c118034..26dc2fa4a 100644 --- a/atom/branch/dataengines.py +++ b/atom/branch/dataengines.py @@ -8,17 +8,24 @@ from __future__ import annotations from abc import ABCMeta, abstractmethod +from polars.dependencies import _lazy_import -import dask.dataframe as dd -import modin.pandas as md import numpy as np import pandas as pd import polars as pl -import pyarrow as pa -import pyspark -import pyspark.pandas as ps from atom.utils.types import Any, DataFrame, Pandas, Sequence +from atom.utils.utils import get_cols + +import os + + +os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" + +dd, _ = _lazy_import("dask.dataframe") +md, _ = _lazy_import("modin.pandas") +pa, _ = _lazy_import("pyarrow") +ps, _ = _lazy_import("pyspark") class DataEngine(metaclass=ABCMeta): @@ -53,18 +60,7 @@ class PandasNumpyEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> Pandas: """Convert to numpy dtypes.""" - if isinstance(obj, pd.DataFrame): - return obj.astype( - { - c: t.numpy_dtype - for c, t in obj.dtypes.items() - if hasattr(t, "numpy_dtype") - } - ) - elif hasattr(obj.dtype, "numpy_dtype"): - return obj.astype(obj.dtype.numpy_dtype) - else: - return obj + return obj.astype({c.name: getattr(c.dtype, "numpy_dtype", None) for c in get_cols(obj)}) class PandasPyarrowEngine(DataEngine): @@ -75,18 +71,13 @@ class PandasPyarrowEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> Pandas: """Convert to pyarrow dtypes.""" - if isinstance(obj, pd.DataFrame): - return obj.astype( - { - c: pd.ArrowDtype(pa.from_numpy_dtype(t)) - for c, t in obj.dtypes.items() - if isinstance(t, np.dtype) - } - ) - elif isinstance(obj.dtype, np.dtype): - return obj.astype(pd.ArrowDtype(pa.from_numpy_dtype(obj.dtype))) - else: - return obj + return obj.astype( + { + col.name: pd.ArrowDtype(pa.from_numpy_dtype(col.dtype)) + if isinstance(col.dtype, np.dtype) else None + for col in get_cols(obj) + } + ) class PolarsEngine(DataEngine): @@ -97,8 +88,6 @@ class PolarsEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> pl.Series | pl.DataFrame: """Convert to polars 
objects.""" - import polars as pl - if isinstance(obj, pd.DataFrame): return pl.DataFrame(obj) elif isinstance(obj, pd.Series): @@ -113,8 +102,6 @@ class PolarsLazyEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> pl.Series | pl.DataFrame: """Convert to lazy polars objects.""" - import polars as pl - if isinstance(obj, pd.DataFrame): return pl.LazyFrame(obj) elif isinstance(obj, pd.Series): @@ -129,8 +116,6 @@ class PyArrowEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> pa.Array | pa.Table: """Convert to pyarrow objects.""" - import pyarrow as pa - if isinstance(obj, pd.DataFrame): return pa.Table.from_pandas(obj) elif isinstance(obj, pd.Series): @@ -145,8 +130,6 @@ class ModinEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> md.Series | md.DataFrame: """Convert to modin objects.""" - import modin.pandas as md - if isinstance(obj, pd.DataFrame): return md.DataFrame(obj) elif isinstance(obj, pd.Series): @@ -161,9 +144,7 @@ class DaskEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> dd.Series | dd.DataFrame: """Convert to dask objects.""" - import dask.dataframe as dd - - return dd.from_pandas(obj) + return dd.from_pandas(obj, npartitions=max(1, len(obj) // 1e6)) class PySparkEngine(DataEngine): @@ -172,11 +153,9 @@ class PySparkEngine(DataEngine): library = "pyspark" @staticmethod - def convert(obj: Pandas) -> pyspark.sql.DataFrame: + def convert(obj: Pandas) -> ps.sql.DataFrame: """Convert to pyspark objects.""" - from pyspark.sql import SparkSession - - spark = SparkSession.builder.appName("atom-ml").getOrCreate() + spark = ps.sql.SparkSession.builder.appName("atom-ml").getOrCreate() return spark.createDataFrame(obj) @@ -186,14 +165,12 @@ class PySparkPandasEngine(DataEngine): library = "pyspark" @staticmethod - def convert(obj: Pandas) -> ps.Series | ps.DataFrame: + def convert(obj: Pandas) -> ps.pandas.Series | ps.pandas.DataFrame: """Convert to pyspark objects.""" - import pyspark.pandas as ps - if isinstance(obj, pd.DataFrame): - return ps.DataFrame(obj) + return ps.pandas.DataFrame(obj) elif isinstance(obj, pd.Series): - return ps.Series(obj) + return ps.pandas.Series(obj) DATA_ENGINES = { @@ -202,6 +179,7 @@ def convert(obj: Pandas) -> ps.Series | ps.DataFrame: "pandas-pyarrow": PandasPyarrowEngine, "polars": PolarsEngine, "polars-lazy": PolarsLazyEngine, + "pyarrow": PyArrowEngine, "modin": ModinEngine, "dask": DaskEngine, "pyspark": PySparkEngine, diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 07840e0a7..e52563aa0 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -56,7 +56,7 @@ EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, - Tabular, Transformer, Verbose, XConstructor, YConstructor, sequence_t, + Tabular, Transformer, Verbose, XConstructor, YConstructor, sequence_t, EngineDataOptions ) from atom.utils.utils import ( Goal, check_is_fitted, composed, crash, get_col_order, get_cols, it, lst, @@ -222,6 +222,44 @@ def inverse_transform( """ return variable_return(X, y) + @composed(crash, method_to_log) + def set_output(self, *, transform: EngineDataOptions | None = None): + """Set output container. + + See sklearn's [user guide][set_output] on how to use the + `set_output` API. See [here][data-acceleration] a description + of the choices. 
+ + Parameters + ---------- + transform: str or None, default=None + Configure the output of the `transform`, `fit_transform`, + and `inverse_transform` method. If None, the configuration + is not changed. Choose from: + + - "numpy" + - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" + - "pyarrow" + - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" + + Returns + ------- + Self + Estimator instance. + + """ + if transform is None: + return self + + self.engine = getattr(self, "engine", EngineTuple()).data = transform + return self + @beartype class Balancer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): diff --git a/atom/models/classreg.py b/atom/models/classreg.py index 50272c83a..87666cc16 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -8,7 +8,6 @@ from __future__ import annotations from typing import Any, ClassVar, cast - import numpy as np import pandas as pd from optuna.distributions import BaseDistribution diff --git a/atom/pipeline.py b/atom/pipeline.py index 0a00b0b3b..c5711c89c 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -27,7 +27,7 @@ from atom.utils.types import ( Bool, DataFrame, Estimator, FHConstructor, Float, Scalar, Sequence, - Tabular, Verbose, XConstructor, YConstructor, + Tabular, Verbose, XConstructor, YConstructor, EngineDataOptions ) from atom.utils.utils import ( NotFittedError, adjust_verbosity, check_is_fitted, fit_one, @@ -954,6 +954,45 @@ def predict_var( return self.steps[-1][1].predict_var(fh=fh, X=X, cov=cov) + @composed(crash, method_to_log) + def set_output(self, *, transform: EngineDataOptions | None = None): + """Set output container. + + See sklearn's [user guide][set_output] on how to use the + `set_output` API. See [here][data-acceleration] a description + of the choices. + + Parameters + ---------- + transform: str or None, default=None + Configure the output of the `transform`, `fit_transform`, + and `inverse_transform` method. If None, the configuration + is not changed. Choose from: + + - "numpy" + - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" + - "pyarrow" + - "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" + + Returns + ------- + Self + Estimator instance. + + """ + if transform is None: + return self + + super().set_output(transform=transform) + self.engine = getattr(self, "engine", EngineTuple()).data = transform + return self + @available_if(_final_estimator_has("score")) def score( self, diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 5231cffbd..a2163ec5c 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -1402,10 +1402,15 @@ def n_cols(obj: XSelector | YSelector) -> int: return obj.shape[1] if len(obj.shape) > 1 else 1 elif isinstance(obj, dict): return len(obj) - elif (array := np.asarray(obj)).ndim > 1: - return array.shape[1] - else: - return array.ndim + + try: + if (array := np.asarray(obj)).ndim > 1: + return array.shape[1] + else: + return array.ndim + except ValueError: + # Fails for inhomogeneous data, return series + return 1 def get_cols(obj: Pandas) -> list[pd.Series]: @@ -1428,18 +1433,19 @@ def get_cols(obj: Pandas) -> list[pd.Series]: return [obj[col] for col in obj.columns] -def get_col_names(obj: Tabular | None) -> list[str] | None: +def get_col_names(obj: Any) -> list[str] | None: """Get a list of column names in tabular objects. Parameters ---------- - obj: series, dataframe or None + obj: object Element to get the column names from. Returns ------- list of str - Names of the columns. 
+ Names of the columns. Returns None when the object passed is + no pandas object. """ if isinstance(obj, pd.DataFrame): @@ -1451,9 +1457,9 @@ def get_col_names(obj: Tabular | None) -> list[str] | None: def variable_return( - X: DataFrame | None, - y: Series | None, -) -> DataFrame | Series | tuple[DataFrame, Tabular]: + X: pd.DataFrame | None, + y: pd.Series | None, +) -> pd.DataFrame | pd.Series | tuple[pd.DataFrame, Pandas]: """Return one or two arguments depending on which is None. This utility is used to make methods return only the provided @@ -1461,15 +1467,15 @@ def variable_return( Parameters ---------- - X: dataframe or None + X: pd.DataFrame or None Feature set. - y: series, dataframe or None + y: pd.Series, pd.DataFrame or None Target column(s). Returns ------- - dataframe, series or tuple + pd.Series, pd.DataFrame or tuple Data sets that are not None. """ @@ -1821,7 +1827,9 @@ def to_df( """ if not isinstance(data, pd.DataFrame | None): - if hasattr(data, "__dataframe__"): + if hasattr(data, "to_pandas"): + data_c = data.to_pandas() + elif hasattr(data, "__dataframe__"): # Transform from dataframe interchange protocol data_c = pd.api.interchange.from_dataframe(data.__dataframe__()) else: @@ -1842,8 +1850,8 @@ def to_df( data_c = data_c[list(columns)] # Force order determined by columns except KeyError: raise ValueError( - f"The columns are different than seen at fit time. " - f"Features {set(data_c.columns) - set(columns)} are missing in X." + f"The columns are different than seen at fit time. Features " + f"{set(data_c.columns) - set(columns)} are missing in X." ) from None return data_c @@ -1891,16 +1899,18 @@ def to_series( """ if not isinstance(data, pd.Series | None): - if isinstance(data, md.Series): - data_c = data - elif isinstance(data, pl.Series): - data_c = data.to_pandas(use_pyarrow_extension_array=True) - elif isinstance(data, pa.Array | pa.ChunkedArray): - data_c = data.to_pandas(types_mapper=pd.ArrowDtype) + if hasattr(data, "to_pandas"): + data_c = data.to_pandas() else: - # Flatten for arrays with shape=(n_samples, 1) + try: + # Flatten for arrays with shape=(n_samples, 1) + array = np.asarray(data).ravel().tolist() + except ValueError: + # Fails for inhomogeneous data + array = data + data_c = pd.Series( - data=np.asarray(data).ravel().tolist(), + data=array, index=index, name=name or "target", ) @@ -1914,7 +1924,7 @@ def to_series( def to_tabular( data: Literal[None], index: Axes | None = ..., - columns: Axes | None = ..., + columns: str | Axes | None = ..., ) -> None: ... @@ -1922,14 +1932,14 @@ def to_tabular( def to_tabular( data: YConstructor, index: Axes | None = ..., - columns: Axes | None = ..., + columns: str | Axes | None = ..., ) -> Tabular: ... def to_tabular( data: YConstructor | None, index: Axes | None = None, - columns: Axes | None = None, + columns: str | Axes | None = None, ) -> Tabular | None: """Convert to a tabular pandas type. 
@@ -1956,7 +1966,7 @@ def to_tabular( if (n_targets := n_cols(data)) == 1: return to_series(data, index=index, name=flt(columns)) else: - if columns is None: + if columns is None and not hasattr(data, "__dataframe__"): columns = [f"y{i}" for i in range(n_targets)] return to_df(data, index=index, columns=columns) @@ -2710,11 +2720,14 @@ def wrapper( ) if "y" in sign(f): - Xt, yt = f(self, Xt, yt, **kwargs) - return self._convert(Xt), self._convert(yt) + out = f(self, Xt, yt, **kwargs) + else: + out = f(self, Xt, **kwargs) + + if isinstance(out, tuple): + return tuple(self._convert(x) for x in out) else: - Xt = f(self, Xt, **kwargs) - return self._convert(Xt) + return self._convert(out) return wrapper diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index 291e317d5..97dc68b4a 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -40,7 +40,7 @@ packages are necessary for its correct functioning. * **[nltk](https://www.nltk.org/)** (>=3.8.1) * **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) -* **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2) +* **[pandas](https://pandas.pydata.org/)** (>=2.1.2) * **[plotly](https://plotly.com/python/)** (>=5.18.0) * **[polars](https://pola.rs/)** (>=0.20.7) * **[pyarrow](https://arrow.apache.org/docs/python/)** (>=15.0.0) diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py index 84e3665e8..6c56c1738 100644 --- a/docs_sources/scripts/autodocs.py +++ b/docs_sources/scripts/autodocs.py @@ -85,6 +85,7 @@ votingregressor="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html", ensembleforecaster="https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.forecasting.compose.EnsembleForecaster.html", # Data cleaning + set_output="https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html", clustercentroids="https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.ClusterCentroids.html", onehotencoder="https://contrib.scikit-learn.org/category_encoders/onehot.html", hashingencoder="https://contrib.scikit-learn.org/category_encoders/hashing.html", diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md index 1200a2d3f..0892be9ac 100644 --- a/docs_sources/user_guide/accelerating.md +++ b/docs_sources/user_guide/accelerating.md @@ -60,7 +60,7 @@ tasks), and both these libraries use numpy as their computation backend. Since `atom` relies heavily on column names, it uses pandas (which in turn uses numpy) as its data backend. However, for the convenience of the user, it implements several data engines, that wraps the data in a different type when called by the -user. This is very similar to sklearn's [set_output](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html) behaviour, but ATOM +user. This is very similar to sklearn's [set_output][] behaviour, but ATOM extends this to many more data types. For example, selecting the `polars` data engine, makes `atom.dataset` return a polars dataframe and `atom.winner.predict(X)` return a polars series. 
diff --git a/pyproject.toml b/pyproject.toml index 5c3297c80..1948432e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "nltk>=3.8.1", "numpy>=1.23.0", "optuna>=3.4.0", - "pandas[parquet]>=2.1.2", + "pandas>=2.1.2", "plotly>=5.18.0", "polars>=0.20.7", "pyarrow>=15.0.0", diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 1a99684c0..4cf8dd589 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -319,7 +319,175 @@ def test_results_property_train_sizing(): assert list(atom.results.index.get_level_values(0)) == [0.2, 0.4, 0.6, 0.8, 1.0] -# Test _set_index ================================================== >> +# Test _check_input ============================================== >> + +def test_input_is_copied(): + """Assert that the data is copied.""" + X, y = BaseTransformer._check_input(X_bin, y_bin) + assert X is not X_bin + assert y is not y_bin + + +def test_input_X_and_y_None(): + """Assert that an error is raised when both X and y are None.""" + with pytest.raises(ValueError, match=".*both None.*"): + BaseTransformer._check_input() + + +def test_X_is_callable(): + """Assert that the data provided can be a callable.""" + X, _ = BaseTransformer._check_input(lambda: [[1, 2], [2, 1], [3, 1]]) + assert isinstance(X, pd.DataFrame) + + +def test_to_pandas(): + """Assert that the data provided is converted to pandas objects.""" + X, y = BaseTransformer._check_input(X_bin_array, y_bin_array) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + + +def test_column_order_is_retained(): + """Assert that column order is kept if column names are specified.""" + X_shuffled = X_bin[sample(list(X_bin.columns), X_bin.shape[1])] + X, _ = BaseTransformer._check_input(X_shuffled, columns=X_bin.columns) + assert list(X.columns) == list(X_bin.columns) + + +def test_incorrect_columns(): + """Assert that an error is raised when the provided columns do not match.""" + with pytest.raises(ValueError, match=".*features are different.*"): + BaseTransformer._check_input(X_bin, columns=["1", "2"]) + + +def test_input_data_in_atom(): + """Assert that the data does not change once in an atom pipeline.""" + atom = ATOMClassifier(X10, y10, random_state=1) + X10[3][2] = 99 # Change an item of the original variable + assert 99 not in atom.dataset # Is unchanged in the pipeline + + +def test_input_data_in_training(): + """Assert that the data does not change once in a training pipeline.""" + train = bin_train.copy() + trainer = DirectClassifier("LR", random_state=1) + trainer.run(train, bin_test) + train.iloc[3, 2] = 99 # Change an item of the original variable + assert 99 not in trainer.dataset # Is unchanged in the pipeline + + +def test_text_to_corpus(): + """Assert that for text data the column is named corpus.""" + atom = ATOMClassifier(X_text, y10, random_state=1) + assert atom.X.columns == ["corpus"] + + +def test_int_columns_to_str(): + """Assert that int columns are converted to str.""" + X = X_bin.copy() + X.columns = range(X.shape[1]) + atom = ATOMClassifier(X, y_bin, random_state=1) + assert atom.X.columns[0] == "0" + + +def test_duplicate_column_names_in_X(): + """Assert that an error is raised when X has duplicate column names.""" + X = merge(X_bin.copy(), pd.Series(1, name="mean texture")) + with pytest.raises(ValueError, match=".*column names found in X.*"): + ATOMClassifier(X, y_bin, random_state=1) + + +def test_sparse_matrices_X_y(): + """Assert that sparse matrices are accepted as (X, y) input.""" + atom = 
ATOMClassifier(X_sparse, y10, random_state=1) + assert isinstance(atom.X, pd.DataFrame) + assert atom.shape == (10, 4) + assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" + + +def test_sparse_matrices_2_tuples(): + """Assert that sparse matrices are accepted as 2-tuples input.""" + atom = ATOMClassifier((X_sparse, y10), (X_sparse, y10), random_state=1) + assert isinstance(atom.X, pd.DataFrame) + assert atom.shape == (20, 4) + assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" + + +def test_target_is_dict(): + """Assert that the target column is assigned correctly for a dict.""" + _, y = BaseTransformer._check_input(X10, {"a": [0] * 10}) + assert isinstance(y, pd.Series) + + +def test_multioutput_str(): + """Assert that multioutput can be assigned by column name.""" + X, y = BaseTransformer._check_input(X_bin, ["mean radius", "worst perimeter"]) + assert list(y.columns) == ["mean radius", "worst perimeter"] + + +def test_multioutput_int(): + """Assert that multioutput can be assigned by column position.""" + X, y = BaseTransformer._check_input(X_bin, [0, 2]) + assert list(y.columns) == ["mean radius", "mean perimeter"] + + +def test_equal_length(): + """Assert that an error is raised when X and y have unequal length.""" + with pytest.raises(ValueError, match=".*number of rows.*"): + BaseTransformer._check_input(X10, [312, 22]) + + +def test_equal_index(): + """Assert that an error is raised when X and y don't have same indices.""" + y = pd.Series(y_bin_array, index=range(10, len(y_bin_array) + 10)) + with pytest.raises(ValueError, match=".*same indices.*"): + BaseTransformer._check_input(X_bin, y) + + +def test_target_is_string(): + """Assert that the target column is assigned correctly for a string.""" + _, y = BaseTransformer._check_input(X_bin, y="mean radius") + assert y.name == "mean radius" + + +def test_target_not_in_dataset(): + """Assert that the target column given by y is in X.""" + with pytest.raises(ValueError, match=".*not found in X.*"): + BaseTransformer._check_input(X_bin, "X") + + +def test_X_is_None_with_str(): + """Assert that an error is raised when X is None and y is a string.""" + with pytest.raises(ValueError, match=".*can't be None when y is a str.*"): + BaseTransformer._check_input(y="test") + + +def test_target_is_int(): + """Assert that target column is assigned correctly for an integer.""" + _, y = BaseTransformer._check_input(X_bin, y=0) + assert y.name == "mean radius" + + +def test_X_is_None_with_int(): + """Assert that an error is raised when X is None and y is an int.""" + with pytest.raises(ValueError, match=".*can't be None when y is an int.*"): + BaseTransformer._check_input(y=1) + + +def test_target_is_none(): + """Assert that target column stays None when empty input.""" + _, y = BaseTransformer._check_input(X_bin, y=None) + assert y is None + + +def test_X_empty_df(): + """Assert that X becomes an empty dataframe when provided but in y.""" + X, y = BaseTransformer._check_input(y_fc, y=-1) + assert X.empty + assert isinstance(y, pd.Series) + + +# Test _get_data =================================================== >> def test_index_is_true(): """Assert that the indices are left as is when index=True.""" @@ -406,8 +574,6 @@ def test_duplicate_indices(): ATOMClassifier(X_bin, X_bin, index=True, random_state=1) -# Test _get_stratify_columns======================================== >> - @pytest.mark.parametrize("stratify", [True, -1, "target", [-1]]) def test_stratify_options(stratify): """Assert that the data can be stratified among 
data sets.""" @@ -437,8 +603,6 @@ def test_stratify_invalid_column_str(): ATOMClassifier(X_bin, y_bin, stratify="invalid", random_state=1) -# Test _get_data =================================================== >> - def test_input_is_y_without_arrays(): """Assert that input y through parameter works.""" atom = ATOMForecaster(y=y_fc, random_state=1) diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index b6915b250..cdddef32b 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -224,174 +224,6 @@ def test_get_est_class_from_default(): assert base._get_est_class("GaussianNB", "naive_bayes") == GaussianNB -# Test _check_input ============================================== >> - -def test_input_is_copied(): - """Assert that the data is copied.""" - X, y = BaseTransformer._check_input(X_bin, y_bin) - assert X is not X_bin - assert y is not y_bin - - -def test_input_X_and_y_None(): - """Assert that an error is raised when both X and y are None.""" - with pytest.raises(ValueError, match=".*both None.*"): - BaseTransformer._check_input() - - -def test_X_is_callable(): - """Assert that the data provided can be a callable.""" - X, _ = BaseTransformer._check_input(lambda: [[1, 2], [2, 1], [3, 1]]) - assert isinstance(X, pd.DataFrame) - - -def test_to_pandas(): - """Assert that the data provided is converted to pandas objects.""" - X, y = BaseTransformer._check_input(X_bin_array, y_bin_array) - assert isinstance(X, pd.DataFrame) - assert isinstance(y, pd.Series) - - -def test_column_order_is_retained(): - """Assert that column order is kept if column names are specified.""" - X_shuffled = X_bin[sample(list(X_bin.columns), X_bin.shape[1])] - X, _ = BaseTransformer._check_input(X_shuffled, columns=X_bin.columns) - assert list(X.columns) == list(X_bin.columns) - - -def test_incorrect_columns(): - """Assert that an error is raised when the provided columns do not match.""" - with pytest.raises(ValueError, match=".*features are different.*"): - BaseTransformer._check_input(X_bin, columns=["1", "2"]) - - -def test_input_data_in_atom(): - """Assert that the data does not change once in an atom pipeline.""" - atom = ATOMClassifier(X10, y10, random_state=1) - X10[3][2] = 99 # Change an item of the original variable - assert 99 not in atom.dataset # Is unchanged in the pipeline - - -def test_input_data_in_training(): - """Assert that the data does not change once in a training pipeline.""" - train = bin_train.copy() - trainer = DirectClassifier("LR", random_state=1) - trainer.run(train, bin_test) - train.iloc[3, 2] = 99 # Change an item of the original variable - assert 99 not in trainer.dataset # Is unchanged in the pipeline - - -def test_text_to_corpus(): - """Assert that for text data the column is named corpus.""" - atom = ATOMClassifier(X_text, y10, random_state=1) - assert atom.X.columns == ["corpus"] - - -def test_int_columns_to_str(): - """Assert that int columns are converted to str.""" - X = X_bin.copy() - X.columns = range(X.shape[1]) - atom = ATOMClassifier(X, y_bin, random_state=1) - assert atom.X.columns[0] == "0" - - -def test_duplicate_column_names_in_X(): - """Assert that an error is raised when X has duplicate column names.""" - X = merge(X_bin.copy(), pd.Series(1, name="mean texture")) - with pytest.raises(ValueError, match=".*column names found in X.*"): - ATOMClassifier(X, y_bin, random_state=1) - - -def test_sparse_matrices_X_y(): - """Assert that sparse matrices are accepted as (X, y) input.""" - atom = ATOMClassifier(X_sparse, y10, 
random_state=1) - assert isinstance(atom.X, pd.DataFrame) - assert atom.shape == (10, 4) - assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" - - -def test_sparse_matrices_2_tuples(): - """Assert that sparse matrices are accepted as 2-tuples input.""" - atom = ATOMClassifier((X_sparse, y10), (X_sparse, y10), random_state=1) - assert isinstance(atom.X, pd.DataFrame) - assert atom.shape == (20, 4) - assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" - - -def test_target_is_dict(): - """Assert that the target column is assigned correctly for a dict.""" - _, y = BaseTransformer._check_input(X10, {"a": [0] * 10}) - assert isinstance(y, pd.Series) - - -def test_multioutput_str(): - """Assert that multioutput can be assigned by column name.""" - X, y = BaseTransformer._check_input(X_bin, ["mean radius", "worst perimeter"]) - assert list(y.columns) == ["mean radius", "worst perimeter"] - - -def test_multioutput_int(): - """Assert that multioutput can be assigned by column position.""" - X, y = BaseTransformer._check_input(X_bin, [0, 2]) - assert list(y.columns) == ["mean radius", "mean perimeter"] - - -def test_equal_length(): - """Assert that an error is raised when X and y have unequal length.""" - with pytest.raises(ValueError, match=".*number of rows.*"): - BaseTransformer._check_input(X10, [312, 22]) - - -def test_equal_index(): - """Assert that an error is raised when X and y don't have same indices.""" - y = pd.Series(y_bin_array, index=range(10, len(y_bin_array) + 10)) - with pytest.raises(ValueError, match=".*same indices.*"): - BaseTransformer._check_input(X_bin, y) - - -def test_target_is_string(): - """Assert that the target column is assigned correctly for a string.""" - _, y = BaseTransformer._check_input(X_bin, y="mean radius") - assert y.name == "mean radius" - - -def test_target_not_in_dataset(): - """Assert that the target column given by y is in X.""" - with pytest.raises(ValueError, match=".*not found in X.*"): - BaseTransformer._check_input(X_bin, "X") - - -def test_X_is_None_with_str(): - """Assert that an error is raised when X is None and y is a string.""" - with pytest.raises(ValueError, match=".*can't be None when y is a str.*"): - BaseTransformer._check_input(y="test") - - -def test_target_is_int(): - """Assert that target column is assigned correctly for an integer.""" - _, y = BaseTransformer._check_input(X_bin, y=0) - assert y.name == "mean radius" - - -def test_X_is_None_with_int(): - """Assert that an error is raised when X is None and y is an int.""" - with pytest.raises(ValueError, match=".*can't be None when y is an int.*"): - BaseTransformer._check_input(y=1) - - -def test_target_is_none(): - """Assert that target column stays None when empty input.""" - _, y = BaseTransformer._check_input(X_bin, y=None) - assert y is None - - -def test_X_empty_df(): - """Assert that X becomes an empty dataframe when provided but in y.""" - X, y = BaseTransformer._check_input(y_fc, y=-1) - assert X.empty - assert isinstance(y, pd.Series) - - # Test log ========================================================= >> def test_log_severity_error(): diff --git a/tests/test_branch.py b/tests/test_branch.py index 85a65af64..b4d44cc53 100644 --- a/tests/test_branch.py +++ b/tests/test_branch.py @@ -7,18 +7,20 @@ import glob import os from pathlib import Path - +import polars as pl import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal from sklearn.preprocessing import MinMaxScaler, StandardScaler - +import pyarrow as pa 
+from unittest.mock import patch, MagicMock from atom import ATOMClassifier, ATOMRegressor from atom.branch import Branch, BranchManager from atom.training import DirectClassifier from atom.utils.utils import merge - +import modin.pandas as md +import dask.dataframe as dd from .conftest import ( X10, X10_str, X_bin, X_bin_array, X_class, X_idx, y10, y10_str, y_bin, y_bin_array, y_idx, y_multiclass, @@ -306,7 +308,7 @@ def test_setter_error_unequal_columns(): """Assert that an error is raised when the setter has unequal columns.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) new_X = atom.train - new_X.insert(0, "new_column", 1) + new_X["new_column"] = 1 with pytest.raises(ValueError, match="number of columns"): atom.train = new_X @@ -702,16 +704,66 @@ def test_numpy_engine(): def test_pandas_numpy_engine(): - """Assert that the pandas numpy engine returns a pandas dataframe.""" + """Assert that the pandas engine returns numpy dtypes.""" atom = ATOMClassifier(X_bin, y_bin, engine="pandas", random_state=1) assert all(isinstance(dtype, np.dtype) for dtype in atom.dataset.dtypes) assert isinstance(atom.y.dtype, np.dtype) def test_pandas_pyarrow_engine(): - """Assert that the pandas pyarrow engine returns pyarrow dtypes.""" + """Assert that the pandas-pyarrow engine returns pyarrow dtypes.""" atom = ATOMClassifier(X_bin, y_bin, engine="pandas-pyarrow", random_state=1) assert all(isinstance(dtype, pd.ArrowDtype) for dtype in atom.dataset.dtypes) assert isinstance(atom.y.dtype, pd.ArrowDtype) +def test_polars_engine(): + """Assert that the polars engine returns polars types.""" + atom = ATOMClassifier(X_bin, y_bin, engine="polars", random_state=1) + assert isinstance(atom.X, pl.DataFrame) + assert isinstance(atom.y, pl.Series) + + +def test_polars_lazy_engine(): + """Assert that the polars-lazy engine returns polars types.""" + atom = ATOMClassifier(X_bin, y_bin, engine="polars-lazy", random_state=1) + assert isinstance(atom.X, pl.LazyFrame) + assert isinstance(atom.y, pl.Series) + + +def test_pyarrow_engine(): + """Assert that the pyarrow engine returns pyarrow types.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + assert isinstance(atom.X, pa.Table) + assert isinstance(atom.y, pa.Array) + + +def test_modin_engine(): + """Assert that the modin engine returns modin types.""" + atom = ATOMClassifier(X_bin, y_bin, engine="modin", random_state=1) + assert isinstance(atom.X, md.DataFrame) + assert isinstance(atom.y, md.Series) + + +def test_dask_engine(): + """Assert that the dask engine returns dask types.""" + atom = ATOMClassifier(X_bin, y_bin, engine="dask", random_state=1) + assert isinstance(atom.X, dd.DataFrame) + assert isinstance(atom.y, dd.Series) + + +@patch.dict("sys.modules", {"pyspark": MagicMock(spec=["__spec__", "sql"])}) +def test_pyspark_engine(): + """Assert that the pyspark engine returns pyspark types.""" + import sys + print(sys.modules) + atom = ATOMClassifier(X_bin, y_bin, engine="pyspark", random_state=1) + assert "createDataFrame" in str(atom.X) + + +@patch.dict("sys.modules", {"pyspark": MagicMock(spec=["__spec__", "pandas"])}) +def test_pyspark_pandas_engine(): + """Assert that the pyspark-pandas engine returns pyspark pandas types.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pyspark-pandas", random_state=1) + assert "DataFrame" in str(atom.X) + assert "Series" in str(atom.y) From 8a1a8a8dbcf784f296b17692e50210af9a44c6bb Mon Sep 17 00:00:00 2001 From: Mavs Date: Wed, 14 Feb 2024 20:30:44 +0100 Subject: [PATCH 03/12] dataengines 4 
--- atom/atom.py | 12 +- atom/basemodel.py | 63 +++--- atom/baserunner.py | 237 +++++--------------- atom/basetrainer.py | 2 +- atom/basetransformer.py | 185 ++++++++++++++-- atom/branch/branch.py | 2 +- atom/branch/dataengines.py | 17 +- atom/data_cleaning.py | 276 ++++++++++-------------- atom/feature_engineering.py | 26 +-- atom/models/classreg.py | 2 +- atom/nlp.py | 17 +- atom/pipeline.py | 32 ++- atom/plots/dataplot.py | 8 +- atom/utils/patches.py | 18 -- atom/utils/types.py | 15 +- atom/utils/utils.py | 52 ++--- docs_sources/scripts/autodocs.py | 4 +- docs_sources/user_guide/accelerating.md | 6 +- tests/conftest.py | 24 ++- tests/test_atom.py | 17 +- tests/test_baserunner.py | 170 +-------------- tests/test_basetrainer.py | 5 +- tests/test_basetransformer.py | 168 +++++++++++++++ 23 files changed, 654 insertions(+), 704 deletions(-) diff --git a/atom/atom.py b/atom/atom.py index 573be0ba2..5eac1fad2 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -47,14 +47,14 @@ ) from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING, __version__ from atom.utils.types import ( - Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DataFrame, + Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DiscretizerStrats, Engine, EngineTuple, Estimator, FeatureNamesOut, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, NormalizerStrats, NumericalStrats, Operators, Predictor, PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, - SPDict, Tabular, TargetSelector, Transformer, VectorizerStarts, Verbose, + SPDict, Pandas, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, XSelector, YSelector, sequence_t, ) from atom.utils.utils import ( @@ -672,7 +672,7 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -1096,7 +1096,7 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Transform new data through the pipeline. 
Transformers that are only applied on the training set are @@ -1454,8 +1454,8 @@ def add( @composed(crash, method_to_log) def apply( self, - func: Callable[..., DataFrame], - inverse_func: Callable[..., DataFrame] | None = None, + func: Callable[..., pd.DataFrame], + inverse_func: Callable[..., pd.DataFrame] | None = None, *, feature_names_out: FeatureNamesOut = None, kw_args: dict[str, Any] | None = None, diff --git a/atom/basemodel.py b/atom/basemodel.py index 8d0f83785..327041c63 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -66,10 +66,10 @@ from atom.utils.constants import DF_ATTRS from atom.utils.patches import fit_and_score from atom.utils.types import ( - HT, Backend, Bool, DataFrame, Engine, FHConstructor, Float, + HT, Backend, Bool, Engine, FHConstructor, Float, FloatZeroToOneExc, Int, IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS, - Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, Tabular, + Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, Pandas, TargetSelector, Verbose, Warnings, XSelector, YSelector, float_t, int_t, ) from atom.utils.utils import ( @@ -77,7 +77,7 @@ TrialsCallback, adjust_verbosity, cache, check_dependency, check_empty, composed, crash, estimator_has_attr, flt, get_cols, get_custom_scorer, has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df, - to_series, to_tabular, + to_series, to_tabular, get_col_names ) @@ -305,7 +305,7 @@ def __contains__(self, item: str) -> bool: """Whether the item is a column in the dataset.""" return item in self.dataset - def __getitem__(self, item: Int | str | list) -> Tabular: + def __getitem__(self, item: Int | str | list) -> Pandas: """Get a subset from the dataset.""" if isinstance(item, int_t): return self.dataset[self.columns[int(item)]] @@ -649,7 +649,7 @@ def _get_pred( rows: RowSelector, target: TargetSelector | None = None, method: PredictionMethods | Sequence[PredictionMethods] = "predict", - ) -> tuple[Tabular, Tabular]: + ) -> tuple[Pandas, Pandas]: """Get the true and predicted values for a column. Predictions are made using the `decision_function` or @@ -765,8 +765,7 @@ def _score_from_est( y_pred = to_tabular( data=estimator.predict(X), index=y.index, - columns=getattr(y, "columns", None), - name=getattr(y, "name", None), + columns=get_col_names(y), ) return self._score_from_pred(scorer, y, y_pred, **kwargs) @@ -774,8 +773,8 @@ def _score_from_est( def _score_from_pred( self, scorer: Scorer, - y_true: Tabular, - y_pred: Tabular, + y_true: Pandas, + y_pred: Pandas, **kwargs, ) -> Float: """Calculate the metric score from predicted values. @@ -2238,11 +2237,11 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are - skipped. The rest should all implement a `inverse_transform` + skipped. The rest should all implement an `inverse_transform` method. If only `X` or only `y` is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target @@ -2437,7 +2436,7 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Transform new data through the pipeline. 
Transformers that are only applied on the training set are @@ -2451,7 +2450,7 @@ def transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. If None, + `X` is ignored. If None, `X` is ignored in the transformers. y: int, str, dict, sequence, dataframe or None, default=None @@ -2534,7 +2533,7 @@ def _prediction( sample_weight: Sequence[Scalar] | None = ..., verbose: Int | None = ..., method: PredictionMethods = ..., - ) -> Tabular: ... + ) -> Pandas: ... def _prediction( self, @@ -2544,7 +2543,7 @@ def _prediction( sample_weight: Sequence[Scalar] | None = None, verbose: Int | None = None, method: PredictionMethods = "predict", - ) -> Float | Tabular: + ) -> Float | Pandas: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2594,7 +2593,7 @@ def _prediction( """ - def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[DataFrame, Tabular]: + def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]: """Get X and y from the pipeline transformation. Parameters @@ -2693,7 +2692,7 @@ def decision_function( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> Tabular: + ) -> Pandas: """Get confidence scores on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2732,7 +2731,7 @@ def predict( *, inverse: Bool = True, verbose: Int | None = None, - ) -> Tabular: + ) -> Pandas: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2779,7 +2778,7 @@ def predict_log_proba( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> DataFrame: + ) -> pd.DataFrame: """Get class log-probabilities on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2801,7 +2800,7 @@ def predict_log_proba( Returns ------- - dataframe + pd.DataFrame Predicted class log-probabilities with shape=(n_samples, n_classes) or shape=(n_samples * n_classes, n_targets) with a multiindex format for [multioutput tasks][]. @@ -2816,7 +2815,7 @@ def predict_proba( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> DataFrame: + ) -> pd.DataFrame: """Get class probabilities on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2838,7 +2837,7 @@ def predict_proba( Returns ------- - dataframe + pd.DataFrame Predicted class probabilities with shape=(n_samples, n_classes) or shape=(n_samples * n_classes, n_targets) with a multiindex format for [multioutput tasks][]. @@ -2967,7 +2966,7 @@ def _prediction( verbose: Int | None = None, method: PredictionMethodsTS = ..., **kwargs, - ) -> Tabular: ... + ) -> Pandas: ... def _prediction( self, @@ -2978,7 +2977,7 @@ def _prediction( verbose: Int | None = None, method: PredictionMethodsTS = "predict", **kwargs, - ) -> Float | Tabular: + ) -> Float | Pandas: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3054,7 +3053,7 @@ def predict( *, inverse: Bool = True, verbose: Int | None = None, - ) -> Tabular: + ) -> Pandas: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3105,7 +3104,7 @@ def predict_interval( *, coverage: Float | Sequence[Float] = 0.9, verbose: Int | None = None, - ) -> DataFrame: + ) -> pd.DataFrame: """Get prediction intervals on new data or existing rows. 
New data is first transformed through the model's pipeline. @@ -3132,7 +3131,7 @@ def predict_interval( Returns ------- - dataframe + pd.DataFrame Computed interval forecasts. """ @@ -3201,7 +3200,7 @@ def predict_quantiles( *, alpha: Float | Sequence[Float] = (0.05, 0.95), verbose: Int | None = None, - ) -> DataFrame: + ) -> pd.DataFrame: """Get quantile forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3229,7 +3228,7 @@ def predict_quantiles( Returns ------- - dataframe + pd.DataFrame Computed quantile forecasts. """ @@ -3249,7 +3248,7 @@ def predict_residuals( X: XSelector | None = None, *, verbose: Int | None = None, - ) -> Tabular: + ) -> Pandas: """Get residuals of forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3288,7 +3287,7 @@ def predict_var( *, cov: Bool = False, verbose: Int | None = None, - ) -> DataFrame: + ) -> pd.DataFrame: """Get variance forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3316,7 +3315,7 @@ def predict_var( Returns ------- - dataframe + pd.DataFrame Computed variance forecasts. """ diff --git a/atom/baserunner.py b/atom/baserunner.py index d05a76e88..01073859d 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -37,10 +37,10 @@ from atom.pipeline import Pipeline from atom.utils.constants import DF_ATTRS from atom.utils.types import ( - Bool, DataFrame, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int, + Bool, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int, IntLargerOne, MetricConstructor, Model, ModelSelector, ModelsSelector, - RowSelector, Seasonality, Segment, Sequence, Series, SPDict, SPTuple, - Tabular, TargetSelector, XSelector, YSelector, bool_t, int_t, pandas_t, + RowSelector, Seasonality, Segment, Sequence, SPDict, SPTuple, + Pandas, TargetSelector, XSelector, YSelector, bool_t, int_t, pandas_t, segment_t, sequence_t, ) from atom.utils.utils import ( @@ -217,7 +217,7 @@ def branch(self) -> Branch: return self._branches.current @property - def holdout(self) -> DataFrame | None: + def holdout(self) -> pd.DataFrame | None: """Holdout set. This data set is untransformed by the pipeline. Read more in @@ -331,141 +331,6 @@ def frac(m: Model) -> float: # Utility methods ============================================== >> - @staticmethod - @overload - def _check_input(X: XSelector, y: Literal[None]) -> tuple[DataFrame, None]: ... - - @staticmethod - @overload - def _check_input(X: Literal[None], y: YSelector) -> tuple[None, Tabular]: ... - - @staticmethod - @overload - def _check_input(X: XSelector, y: YSelector) -> tuple[DataFrame, Tabular]: ... - - @staticmethod - def _check_input( - X: XSelector | None = None, - y: YSelector | None = None, - *, - columns: list[str] | None = None, - name: str | list[str] | None = None, - ) -> tuple[DataFrame | None, Tabular | None]: - """Prepare the input data. - - Convert X and y to pandas (if not already) and perform standard - compatibility checks (dimensions, length, indices, etc...). - - Parameters - ---------- - X: dataframe-like or None, default=None - Feature set with shape=(n_samples, n_features). If None, - X is ignored. - - y: int, str, dict, sequence, dataframe or None, default=None - Target column(s) corresponding to `X`. - - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. 
- - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - - columns: list, default=None - Column names for the feature set. If None, default names - are used. - - name: str, default=None - Name of the target column(s). If None, a default name is - used. - - Returns - ------- - dataframe or None - Feature dataset. Only returned if provided. - - series, dataframe or None - Target column(s) corresponding to `X`. - - """ - Xt: pd.DataFrame | None = None - yt: Pandas | None = None - - if X is None and y is None: - raise ValueError("X and y can't be both None!") - elif X is not None: - Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) - - # If text dataset, change the name of the column to corpus - if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": - Xt = Xt.rename(columns={Xt.columns[0]: "corpus"}) - else: - # Convert all column names to str - Xt.columns = Xt.columns.astype(str) - - # No duplicate rows nor column names are allowed - if Xt.columns.duplicated().any(): - raise ValueError("Duplicate column names found in X.") - - # Prepare target column - if isinstance(y, (dict, *sequence_t, DataFrame)): - if isinstance(y, dict): - yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None), columns=name) - else: - # If X and y have different number of rows, try multioutput - if Xt is not None and len(Xt) != len(y): - try: - targets: list[Hashable] = [] - for col in y: - if col in Xt.columns: - targets.append(col) - elif isinstance(col, int_t): - if -Xt.shape[1] <= col < Xt.shape[1]: - targets.append(Xt.columns[int(col)]) - else: - raise IndexError( - "Invalid value for the y parameter. Value " - f"{col} is out of range for data with " - f"{Xt.shape[1]} columns." - ) - - Xt, yt = Xt.drop(columns=targets), Xt[targets] - - except (TypeError, IndexError, KeyError): - raise ValueError( - "X and y don't have the same number of rows," - f" got len(X)={len(Xt)} and len(y)={len(y)}." - ) from None - else: - yt = y - - yt = to_tabular(deepcopy(yt), index=getattr(Xt, "index", None), columns=name) - - # Check X and y have the same indices - if Xt is not None and not Xt.index.equals(yt.index): - raise ValueError("X and y don't have the same indices!") - - elif isinstance(y, str): - if Xt is not None: - if y not in Xt.columns: - raise ValueError(f"Column {y} not found in X!") - - Xt, yt = Xt.drop(columns=y), Xt[y] - - else: - raise ValueError("X can't be None when y is a string.") - - elif isinstance(y, int_t): - if Xt is None: - raise ValueError("X can't be None when y is an int.") - - Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] - - return Xt, yt - def _get_sp(self, sp: Seasonality) -> int | list[int] | None: """Get the seasonal period. @@ -531,7 +396,7 @@ def _get_data( arrays: tuple[Any, ...], y: YSelector = -1, *, - index: IndexSelector = False, + index: IndexSelector | None = None, ) -> tuple[DataContainer, pd.DataFrame | None]: """Get data sets from a sequence of indexables. @@ -546,8 +411,9 @@ def _get_data( y: int, str or sequence, default=-1 Transformed target column. - index: bool, int, str or sequence, default=False - Index parameter as provided in constructor. + index: bool, int, str, sequence or None, default=None + Index parameter as provided in constructor. If None, the + index is retrieved from `self._config`. 
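A side note on the new `index` parameter above: `None` now means "fall back to the index selector stored in the data configuration", while an explicit value wins. A minimal sketch of that fallback pattern, using a simplified stand-in for atom's `DataConfig` (not the class from this patch):

```python
from dataclasses import dataclass


@dataclass
class DataConfigSketch:
    """Simplified stand-in for atom's DataConfig."""

    index: bool | int | str = False


def resolve_index(index: bool | int | str | None, config: DataConfigSketch) -> bool | int | str:
    """Use the explicit selector if given, else the one from the config."""
    return config.index if index is None else index


config = DataConfigSketch(index="id")
print(resolve_index(None, config))   # "id"  -> taken from the config
print(resolve_index(False, config))  # False -> explicit argument wins
```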
Returns ------- @@ -559,7 +425,7 @@ def _get_data( """ - def _subsample(df: DataFrame) -> DataFrame: + def _subsample(df: pd.DataFrame) -> pd.DataFrame: """Select a random subset of a dataframe. If shuffle=True, the subset is shuffled, else row order @@ -568,12 +434,12 @@ def _subsample(df: DataFrame) -> DataFrame: Parameters ---------- - df: dataframe + df: pd.DataFrame Dataset. Returns ------- - dataframe + pd.DataFrame Subset of df. """ @@ -589,25 +455,36 @@ def _subsample(df: DataFrame) -> DataFrame: else: return df.iloc[sorted(random.sample(range(len(df)), k=n_rows))] - def _set_index(df: DataFrame, y: Tabular | None) -> DataFrame: + def _set_index( + df: DataFrame, + y: Pandas | None, + index: IndexSelector | None = None, + ) -> pd.DataFrame: """Assign an index to the dataframe. Parameters ---------- - df: dataframe + df: pd.DataFrame Dataset. - y: series, dataframe or None + y: pd.Series, pd.DataFrame or None Target column(s). Used to check that the provided index is not one of the target columns. If None, the check is skipped. + index: bool, int, str or sequence or None, default=None + Index parameter as provided in constructor. If None, the + index is retrieved from `self._config`. + Returns ------- - dataframe + pd.DataFrame Dataset with updated indices. """ + if index is None: + index = self._config.index + if index is True: # True gets caught by isinstance(int) pass elif index is False: @@ -644,9 +521,9 @@ def _set_index(df: DataFrame, y: Tabular | None) -> DataFrame: return df def _no_data_sets( - X: DataFrame, - y: Tabular, - ) -> tuple[DataContainer, DataFrame | None]: + X: pd.DataFrame, + y: Pandas, + ) -> tuple[DataContainer, pd.DataFrame | None]: """Generate data sets from one dataset. Additionally, assigns an index, shuffles the data, selects @@ -655,10 +532,10 @@ def _no_data_sets( Parameters ---------- - X: dataframe + X: pd.DataFrame Feature set with shape=(n_samples, n_features). - y: series or dataframe + y: pd.Series or pd.DataFrame Target column(s) corresponding to `X`. Returns @@ -666,7 +543,7 @@ def _no_data_sets( DataContainer Train and test sets. - dataframe or None + pd.DataFrame or None Holdout data set. Returns None if not specified. """ @@ -739,7 +616,7 @@ def _no_data_sets( stratify=self._config.get_stratify_columns(data, y), ) - complete_set = _set_index(pd.concat([train, test, holdout]), y) + complete_set = _set_index(pd.concat([train, test, holdout]), y, index) container = DataContainer( data=(data := complete_set.iloc[: len(data)]), @@ -766,13 +643,13 @@ def _no_data_sets( return container, holdout def _has_data_sets( - X_train: DataFrame, - y_train: Tabular, - X_test: DataFrame, - y_test: Tabular, - X_holdout: DataFrame | None = None, - y_holdout: Tabular | None = None, - ) -> tuple[DataContainer, DataFrame | None]: + X_train: pd.DataFrame, + y_train: Pandas, + X_test: pd.DataFrame, + y_test: Pandas, + X_holdout: pd.DataFrame | None = None, + y_holdout: Pandas | None = None, + ) -> tuple[DataContainer, pd.DataFrame | None]: """Generate data sets from provided sets. Additionally, assigns an index, shuffles the data and @@ -780,22 +657,22 @@ def _has_data_sets( Parameters ---------- - X_train: dataframe + X_train: pd.DataFrame Training set. - y_train: series or dataframe + y_train: pd.Series or pd.DataFrame Target column(s) corresponding to `X`_train. - X_test: dataframe + X_test: pd.DataFrame Test set. - y_test: series or dataframe + y_test: pd.Series or pd.DataFrame Target column(s) corresponding to `X`_test. 
- X_holdout: dataframe or None - Holdout set. Is None if not provided by the user. + X_holdout: pd.DataFrame or None, default=None + Holdout set. Can be None if not provided by the user. - y_holdout: series, dataframe or None + y_holdout: pd.Series, pd.DataFrame or None, default=None Target column(s) corresponding to `X`_holdout. Returns @@ -803,7 +680,7 @@ def _has_data_sets( DataContainer Train and test sets. - dataframe or None + pd.DataFrame or None Holdout data set. Returns None if not specified. """ @@ -851,7 +728,7 @@ def _has_data_sets( if holdout is not None: holdout.index = index[-len(holdout):] - complete_set = _set_index(pd.concat([train, test, holdout]), y_test) + complete_set = _set_index(pd.concat([train, test, holdout]), y_test, index) container = DataContainer( data=(data := complete_set.iloc[:len(train) + len(test)]), @@ -868,16 +745,16 @@ def _has_data_sets( # Process input arrays ===================================== >> if len(arrays) == 0: - if self._goal.name == "forecast" and not isinstance(y, (*int_t, str)): + if self.branch._container: + return self.branch._data, self.branch._holdout + elif self._goal is Goal.forecast and not isinstance(y, Int | str): # arrays=() and y=y for forecasting sets = _no_data_sets(*self._check_input(y=y)) - elif not self.branch._container: + else: raise ValueError( "The data arrays are empty! Provide the data to run the pipeline " "successfully. See the documentation for the allowed formats." ) - else: - return self.branch._data, self.branch._holdout elif len(arrays) == 1: # X or y for forecasting @@ -1278,12 +1155,12 @@ def get_class_weight( """ - def get_weights(col: Series) -> dict[Hashable, float]: + def get_weights(col: pd.Series) -> dict[Hashable, float]: """Get the class weights for one column. Parameters ---------- - col: series + col: pd.Series Column to get the weights from. Returns @@ -1304,7 +1181,7 @@ def get_weights(col: Series) -> dict[Hashable, float]: @available_if(has_task("classification")) @composed(crash, beartype) - def get_sample_weight(self, rows: RowSelector = "train") -> Series: + def get_sample_weight(self, rows: RowSelector = "train") -> pd.Series: """Return sample weights for a balanced data set. The returned weights are inversely proportional to the class @@ -1319,7 +1196,7 @@ def get_sample_weight(self, rows: RowSelector = "train") -> Series: Returns ------- - series + pd.Series Sequence of weights with shape=(n_samples,). 
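The `get_class_weight`/`get_sample_weight` docstrings above describe weights that are inversely proportional to the class frequencies. A standalone pandas sketch of that rule on a toy target column; this paraphrases the documented behavior rather than copying atom's implementation:

```python
import pandas as pd

y = pd.Series([0, 0, 0, 0, 1, 1, 2], name="target")

# Class weights inversely proportional to frequency (rarest class gets weight 1).
counts = y.value_counts()
class_weight = (counts.min() / counts).to_dict()
print(class_weight)  # {0: 0.25, 1: 0.5, 2: 1.0}

# Per-sample weights: each row receives its class's weight.
sample_weight = y.map(class_weight)
print(sample_weight.tolist())  # [0.25, 0.25, 0.25, 0.25, 0.5, 0.5, 1.0]
```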
""" diff --git a/atom/basetrainer.py b/atom/basetrainer.py index 8310d34ae..1d64279ef 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -70,7 +70,7 @@ def __init__( self._models = lst(models) if models is not None else ClassMap() self._metric = lst(metric) if metric is not None else ClassMap() - self._config = DataConfig() + self._config = DataConfig(index=self._goal is Goal.forecast) self._branches = BranchManager(memory=self.memory) self._n_trials = {} diff --git a/atom/basetransformer.py b/atom/basetransformer.py index 4ba7753b3..067100c01 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -18,28 +18,30 @@ from logging import DEBUG, FileHandler, Formatter, Logger, getLogger from multiprocessing import cpu_count from pathlib import Path -from typing import Any, TypeVar - -import dagshub -import joblib -import mlflow +from typing import Any, TypeVar, overload +from copy import deepcopy import numpy as np import pandas as pd -import ray import requests +from polars.dependencies import _lazy_import from beartype import beartype -from dagshub.auth.token_auth import HTTPBearerAuth -from dask.distributed import Client +import joblib from joblib.memory import Memory -from ray.util.joblib import register_ray from sklearn.utils.validation import check_memory from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Severity, - Verbose, Warnings, bool_t, + Verbose, Warnings, bool_t, Sequence, int_t ) -from atom.utils.utils import check_dependency, crash, lst, make_sklearn +from atom.utils.utils import check_dependency, crash, lst, make_sklearn, to_df, to_tabular + + +mlflow, _ = _lazy_import("mlflow") +dagshub, _ = _lazy_import("dagshub") +ray, _ = _lazy_import("ray") +ray_joblib, _ = _lazy_import("ray.util.joblib") +dask, _ = _lazy_import("dask") T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -177,14 +179,14 @@ def backend(self) -> Backend: @beartype def backend(self, value: Backend): if value == "ray": - register_ray() # Register ray as joblib backend + ray_joblib.register_ray() # Register ray as joblib backend if not ray.is_initialized(): ray.init(log_to_driver=False) elif value == "dask": try: - Client.current() + dask.distributed.Client.current() except ValueError: - Client(processes=False) + dask.distributed.Client(processes=False) joblib.parallel_config(backend=value) @@ -306,7 +308,7 @@ def experiment(self, value: str | None): # Fetch username from dagshub api username = requests.get( url="https://dagshub.com/api/v1/user", - auth=HTTPBearerAuth(token), + auth=dagshub.auth.token_auth.HTTPBearerAuth(token), timeout=5, ).json()["username"] @@ -357,6 +359,159 @@ def _device_id(self) -> int: # Methods ====================================================== >> + @staticmethod + @overload + def _check_input( + X: XSelector, + y: Literal[None], + *, + columns: Sequence[str] | None = None, + name: str | Sequence[str] | None = None, + ) -> tuple[pd.DataFrame, None]: ... + + @staticmethod + @overload + def _check_input( + X: Literal[None], + y: YSelector, + *, + columns: Sequence[str] | None = None, + name: str | Sequence[str] | None = None, + ) -> tuple[None, Pandas]: ... + + @staticmethod + @overload + def _check_input( + X: XSelector, + y: YSelector, + *, + columns: Sequence[str] | None = None, + name: str | Sequence[str] | None = None, + ) -> tuple[pd.DataFrame, Pandas]: ... 
+ + @staticmethod + def _check_input( + X: XSelector | None = None, + y: YSelector | None = None, + *, + columns: Sequence[str] | None = None, + name: str | Sequence[str] | None = None, + ) -> tuple[pd.DataFrame | None, Pandas | None]: + """Prepare the input data. + + Convert X and y to pandas (if not already) and perform standard + compatibility checks (dimensions, length, indices, etc...). + + Parameters + ---------- + X: dataframe-like or None, default=None + Feature set with shape=(n_samples, n_features). If None, + `X` is ignored. + + y: int, str, dict, sequence, dataframe or None, default=None + Target column(s) corresponding to `X`. + + - If None: `y` is ignored. + - If int: Position of the target column in `X`. + - If str: Name of the target column in `X`. + - If dict: Name of the target column and sequence of values. + - If sequence: Target column with shape=(n_samples,) or + sequence of column names or positions for multioutput + tasks. + - If dataframe: Target columns for multioutput tasks. + + columns: sequence of str or None, default=None + Column names for the feature set. If None, default names + are used. + + name: str, sequence or None, default=None + Name of the target column(s). If None, a default name is + used. + + Returns + ------- + dataframe or None + Feature dataset. Only returned if provided. + + series, dataframe or None + Target column(s) corresponding to `X`. + + """ + Xt: pd.DataFrame | None = None + yt: Pandas | None = None + + if X is None and y is None: + raise ValueError("X and y can't be both None!") + elif X is not None: + Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) + + # If text dataset, change the name of the column to corpus + if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": + Xt = Xt.rename(columns={Xt.columns[0]: "corpus"}) + else: + # Convert all column names to str + Xt.columns = Xt.columns.astype(str) + + # No duplicate rows nor column names are allowed + if Xt.columns.duplicated().any(): + raise ValueError("Duplicate column names found in X.") + + # Prepare target column + if not isinstance(y, Int | str | None): + if isinstance(y, dict): + yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None), columns=name) + else: + # If X and y have different number of rows, try multioutput + if Xt is not None and len(Xt) != len(y): + try: + targets: list[Hashable] = [] + for col in y: + if col in Xt.columns: + targets.append(col) + elif isinstance(col, int_t): + if -Xt.shape[1] <= col < Xt.shape[1]: + targets.append(Xt.columns[int(col)]) + else: + raise IndexError( + "Invalid value for the y parameter. Value " + f"{col} is out of range for data with " + f"{Xt.shape[1]} columns." + ) + + Xt, yt = Xt.drop(columns=targets), Xt[targets] + + except (TypeError, IndexError, KeyError): + raise ValueError( + "X and y don't have the same number of rows," + f" got len(X)={len(Xt)} and len(y)={len(y)}." 
+ ) from None + else: + yt = y + + yt = to_tabular(deepcopy(yt), index=getattr(Xt, "index", None), columns=name) + + # Check X and y have the same indices + if Xt is not None and not Xt.index.equals(yt.index): + raise ValueError("X and y don't have the same indices!") + + elif isinstance(y, str): + if Xt is not None: + if y not in Xt.columns: + raise ValueError(f"Column {y} not found in X!") + + Xt, yt = Xt.drop(columns=y), Xt[y] + + else: + raise ValueError("X can't be None when y is a string.") + + elif isinstance(y, int_t): + if Xt is None: + raise ValueError("X can't be None when y is an int.") + + Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] + + return Xt, yt + def _convert(self, obj: Any) -> Any: """Convert data to the type set in the data engine. diff --git a/atom/branch/branch.py b/atom/branch/branch.py index 70fd628c8..3a501e841 100644 --- a/atom/branch/branch.py +++ b/atom/branch/branch.py @@ -23,7 +23,7 @@ from atom.pipeline import Pipeline from atom.utils.types import ( - Bool, ColumnSelector, DataFrame, Int, IntLargerEqualZero, Pandas, + Bool, ColumnSelector, Int, IntLargerEqualZero, Pandas, RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, XConstructor, XSelector, YSelector, int_t, segment_t, ) diff --git a/atom/branch/dataengines.py b/atom/branch/dataengines.py index 26dc2fa4a..cebe51f93 100644 --- a/atom/branch/dataengines.py +++ b/atom/branch/dataengines.py @@ -14,7 +14,7 @@ import pandas as pd import polars as pl -from atom.utils.types import Any, DataFrame, Pandas, Sequence +from atom.utils.types import Any, Pandas, Sequence from atom.utils.utils import get_cols import os @@ -38,7 +38,7 @@ class DataEngine(metaclass=ABCMeta): @staticmethod @abstractmethod - def convert(obj: Pandas) -> np.ndarray | Sequence[Any] | DataFrame: ... + def convert(obj: Pandas) -> np.ndarray | Sequence[Any] | pd.DataFrame: ... 
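To make the target-selection branches of `_check_input` above easier to follow, here is the same idea in plain pandas: `y` can be a column name, a column position, or a separate sequence aligned on `X`'s index. This mirrors the docstring; it is not atom code:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "target": [0, 1, 0]})

# y as a column name: split the target off from X.
X1, y1 = df.drop(columns="target"), df["target"]

# y as a column position (here -1, i.e. the last column).
col = df.columns[-1]
X2, y2 = df.drop(columns=col), df[col]

# y as a separate sequence: align it on X's index.
y3 = pd.Series([0, 1, 0], index=df.index, name="target")

assert y1.equals(y2) and y2.equals(y3)
```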
class NumpyEngine(DataEngine): @@ -52,15 +52,15 @@ def convert(obj: Pandas) -> np.ndarray: return obj.to_numpy() -class PandasNumpyEngine(DataEngine): +class PandasEngine(DataEngine): """Pandas numpy data engine.""" library = "pandas" @staticmethod def convert(obj: Pandas) -> Pandas: - """Convert to numpy dtypes.""" - return obj.astype({c.name: getattr(c.dtype, "numpy_dtype", None) for c in get_cols(obj)}) + """Leave as is.""" + return obj class PandasPyarrowEngine(DataEngine): @@ -73,8 +73,9 @@ def convert(obj: Pandas) -> Pandas: """Convert to pyarrow dtypes.""" return obj.astype( { - col.name: pd.ArrowDtype(pa.from_numpy_dtype(col.dtype)) - if isinstance(col.dtype, np.dtype) else None + col.name: pd.ArrowDtype( + pa.from_numpy_dtype(getattr(col.dtype, "numpy_dtype", col.dtype)) + ) for col in get_cols(obj) } ) @@ -175,7 +176,7 @@ def convert(obj: Pandas) -> ps.pandas.Series | ps.pandas.DataFrame: DATA_ENGINES = { "numpy": NumpyEngine, - "pandas": PandasNumpyEngine, + "pandas": PandasEngine, "pandas-pyarrow": PandasPyarrowEngine, "polars": PolarsEngine, "polars-lazy": PolarsLazyEngine, diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index e52563aa0..8e061ecb5 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -50,18 +50,17 @@ from atom.basetransformer import BaseTransformer from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING -from atom.utils.patches import wrap_method_output from atom.utils.types import ( - Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine, + Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, - Tabular, Transformer, Verbose, XConstructor, YConstructor, sequence_t, EngineDataOptions + Pandas, Transformer, Verbose, XConstructor, YConstructor, sequence_t, EngineDataOptions ) from atom.utils.utils import ( Goal, check_is_fitted, composed, crash, get_col_order, get_cols, it, lst, make_sklearn, merge, method_to_log, n_cols, replace_missing, sign, to_df, - to_series, variable_return, wrap_transformer_methods, + to_series, variable_return, wrap_transformer_methods, to_tabular ) @@ -76,8 +75,8 @@ class TransformerMixin(BaseEstimator, BaseTransformer): - Accounts for the transformation of y. - Always add a fit method. - - Wraps the fit method with a data check. - - Wraps transforming methods with fit and data check. + - Wraps the fit method with attributes and a data check. + - Wraps transforming methods a data check. - Maintains internal attributes when cloned. """ @@ -87,10 +86,6 @@ def __init_subclass__(cls, **kwargs): for k in ("fit", "transform", "inverse_transform"): setattr(cls, k, wrap_transformer_methods(getattr(cls, k))) - # Patch to avoid errors for transformers that allow passing only y - with patch("sklearn.utils._set_output._wrap_method_output", wrap_method_output): - super().__init_subclass__(**kwargs) - def __repr__(self, N_CHAR_MAX: Int = 700) -> str: """Drop named tuples if default parameters from string representation.""" out = super().__repr__(N_CHAR_MAX) @@ -116,13 +111,7 @@ def __sklearn_clone__(self: T_Transformer) -> T_Transformer: return cloned - @composed(crash, method_to_log) - def fit( - self, - X: XConstructor | None = None, - y: YConstructor | None = None, - **fit_params, - ) -> Self: + def fit(self, X, y, **fit_params) -> Self: """Do nothing. Implemented for continuity of the API. 
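The data engines above all follow the same shape: a `library` attribute, a static `convert` method, and an entry in the `DATA_ENGINES` registry. A hedged sketch of an extra engine following that pattern; the abstract base here is a stand-in and the "records" engine is purely hypothetical:

```python
from abc import ABCMeta, abstractmethod

import pandas as pd


class DataEngineSketch(metaclass=ABCMeta):
    """Stand-in for atom's abstract DataEngine."""

    @staticmethod
    @abstractmethod
    def convert(obj: pd.Series | pd.DataFrame): ...


class RecordsEngine(DataEngineSketch):
    """Hypothetical engine returning plain Python containers."""

    library = "builtins"

    @staticmethod
    def convert(obj: pd.Series | pd.DataFrame) -> list:
        if isinstance(obj, pd.Series):
            return obj.tolist()
        return obj.to_dict(orient="records")


DATA_ENGINES = {"records": RecordsEngine}  # same registry pattern as above

print(DATA_ENGINES["records"].convert(pd.DataFrame({"x": [1, 2]})))  # [{'x': 1}, {'x': 2}]
```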
@@ -131,10 +120,11 @@ def fit( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, sequence, dataframe-like or None, default=None - Target column(s) corresponding to `X`. + y: dict, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. If None, `y` is + ignored. **fit_params Additional keyword arguments for the fit method. @@ -149,23 +139,18 @@ def fit( return self - @composed(crash, method_to_log) - def fit_transform( - self, - X: XConstructor | None = None, - y: YConstructor | None = None, - **fit_params, - ) -> Tabular | tuple[DataFrame, Tabular]: + def fit_transform(self, X, y, **fit_params) -> Pandas | tuple[pd.DataFrame, Pandas]: """Fit to data, then transform it. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, sequence, dataframe-like or None, default=None - Target column(s) corresponding to `X`. + y: dict, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. If None, `y` is + ignored. **fit_params Additional keyword arguments for the fit method. @@ -181,12 +166,7 @@ def fit_transform( """ return self.fit(X, y, **fit_params).transform(X, y) - @composed(crash, method_to_log) - def inverse_transform( - self, - X: XConstructor | None = None, - y: YConstructor | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + def inverse_transform(self, X, y) -> Pandas | tuple[pd.DataFrame, Pandas]: """Do nothing. Returns the input unchanged. Implemented for continuity of the @@ -196,20 +176,11 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. - y: int, str, sequence, dataframe-like or None, default=None - Target column(s) corresponding to `X`. - - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. + y: dict, sequence, dataframe-like or None, default=None + Target column(s) corresponding to `X`. If None, `y` is + ignored. Returns ------- @@ -220,10 +191,9 @@ def inverse_transform( Target column(s). Only returned if provided. """ - return variable_return(X, y) + return variable_return(self._convert(X), self._convert(y)) - @composed(crash, method_to_log) - def set_output(self, *, transform: EngineDataOptions | None = None): + def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: """Set output container. See sklearn's [user guide][set_output] on how to use the @@ -381,8 +351,23 @@ def __init__( self.strategy = strategy self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular = -1) -> Self: + def _log_changes(self, y: pd.Series): + """Print the changes per target class. + + Parameters + ---------- + y: pd.Series + Target column. 
+ + """ + for key, value in self.mapping_.items(): + diff = self._counts[key] - np.sum(y == value) + if diff > 0: + self._log(f" --> Removing {diff} samples from class {key}.", 2) + elif diff < 0: + self._log(f" --> Adding {-diff} samples to class {key}.", 2) + + def fit(self, X: XConstructor, y: YConstructor) -> Self: """Fit to data. Parameters @@ -390,17 +375,8 @@ def fit(self, X: DataFrame, y: Tabular = -1) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict or sequence, default=-1 - Target column(s) corresponding to `X`. - - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. + y: dict or sequence + Target column corresponding to `X`. Returns ------- @@ -408,8 +384,13 @@ def fit(self, X: DataFrame, y: Tabular = -1) -> Self: Estimator instance. """ - if isinstance(y, pd.Series): - self.target_names_in_ = np.array([y.name]) + Xt = to_df(X, index=getattr(y, "index", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + + if isinstance(yt, pd.Series): + self.target_names_in_ = np.array([yt.name]) else: raise ValueError("The Balancer class does not support multioutput tasks.") @@ -463,15 +444,14 @@ def fit(self, X: DataFrame, y: Tabular = -1) -> Self: for key, value in self.mapping_.items(): self._counts[key] = np.sum(y == value) - self._estimator = estimator.fit(X, y) + self._estimator = estimator.fit(Xt, yt) # Add the estimator as attribute to the instance setattr(self, f"{estimator.__class__.__name__.lower()}_", self._estimator) return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular = -1) -> tuple[DataFrame, Series]: + def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd.Series]: """Balance the data. Parameters @@ -479,12 +459,8 @@ def transform(self, X: DataFrame, y: Tabular = -1) -> tuple[DataFrame, Series]: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str or sequence, default=-1 - Target column(s) corresponding to `X`. - - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - Else: Array with shape=(n_samples,) to use as target. + y: dict or sequence + Target column corresponding to `X`. Returns ------- @@ -495,54 +471,57 @@ def transform(self, X: DataFrame, y: Tabular = -1) -> tuple[DataFrame, Series]: Transformed target column. 
""" + check_is_fitted(self) - def log_changes(y): - """Print the changes per target class.""" - for key, value in self.mapping_.items(): - diff = self._counts[key] - np.sum(y == value) - if diff > 0: - self._log(f" --> Removing {diff} samples from class {key}.", 2) - elif diff < 0: - self._log(f" --> Adding {-diff} samples to class {key}.", 2) + Xt = to_df( + data=X, + index=getattr(y, "index", None), + columns=getattr(self, "feature_names_in_", None), + ) + yt = to_tabular( + y, + index=getattr(Xt, "index", None), + columns=getattr(self, "target_names_in_", None), + ) if "over_sampling" in self._estimator.__module__: self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1) - index = X.index # Save indices for later reassignment - X, y = self._estimator.fit_resample(X, y) + index = Xt.index # Save indices for later reassignment + Xt, yt = self._estimator.fit_resample(Xt, yt) # Create indices for the new samples n_idx: list[int | str] if index.dtype.kind in "ifu": - n_idx = list(range(max(index) + 1, max(index) + len(X) - len(index) + 1)) + n_idx = list(range(max(index) + 1, max(index) + len(Xt) - len(index) + 1)) else: n_idx = [ f"{self._estimator.__class__.__name__.lower()}_{i}" - for i in range(1, len(X) - len(index) + 1) + for i in range(1, len(Xt) - len(index) + 1) ] # Assign the old + new indices - X.index = list(index) + list(n_idx) - y.index = list(index) + list(n_idx) + Xt.index = list(index) + list(n_idx) + yt.index = list(index) + list(n_idx) - log_changes(y) + self._log_changes(yt) elif "under_sampling" in self._estimator.__module__: self._log(f"Undersampling with {self._estimator.__class__.__name__}...", 1) - self._estimator.fit_resample(X, y) + self._estimator.fit_resample(Xt, yt) # Select chosen rows (imblearn doesn't return them in order) samples = sorted(self._estimator.sample_indices_) - X, y = X.iloc[samples], y.iloc[samples] # type: ignore[call-overload] + Xt, yt = Xt.iloc[samples], yt.iloc[samples] # type: ignore[call-overload] - log_changes(y) + self._log_changes(yt) elif "combine" in self._estimator.__module__: self._log(f"Balancing with {self._estimator.__class__.__name__}...", 1) - index = X.index - X_new, y_new = self._estimator.fit_resample(X, y) + index = Xt.index + X_new, y_new = self._estimator.fit_resample(Xt, yt) # Select rows kept by the undersampler if self._estimator.__class__.__name__ == "SMOTEENN": @@ -551,16 +530,16 @@ def log_changes(y): samples = sorted(self._estimator.tomek_.sample_indices_) # Select the remaining samples from the old dataframe - o_samples = [s for s in samples if s < len(X)] - X, y = X.iloc[o_samples], y.iloc[o_samples] # type: ignore[call-overload] + o_samples = [s for s in samples if s < len(Xt)] + Xt, yt = Xt.iloc[o_samples], yt.iloc[o_samples] # type: ignore[call-overload] # Create indices for the new samples if index.dtype.kind in "ifu": - n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(X) + 1)) + n_idx = list(range(max(index) + 1, max(index) + len(X_new) - len(Xt) + 1)) else: n_idx = [ f"{self._estimator.__class__.__name__.lower()}_{i}" - for i in range(1, len(X_new) - len(X) + 1) + for i in range(1, len(X_new) - len(Xt) + 1) ] # Select the new samples and assign the new indices @@ -576,13 +555,13 @@ def log_changes(y): # Then, output the samples dropped for key, value in self.mapping_.items(): - if (diff := self._counts[key] - np.sum(y == value)) > 0: + if (diff := self._counts[key] - np.sum(yt == value)) > 0: self._log(f" --> Removing {diff} samples from class: {key}.", 2) # Add the new 
samples to the old dataframe - X, y = pd.concat([X, X_new]), pd.concat([y, y_new]) + Xt, yt = pd.concat([Xt, X_new]), pd.concat([yt, y_new]) - return X, y + return self._convert(Xt), self._convert(yt) @beartype @@ -640,24 +619,12 @@ class Cleaner(TransformerMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -750,15 +717,14 @@ def __init__( self.drop_missing_target = drop_missing_target self.encode_target = encode_target - @composed(crash, method_to_log) - def fit(self, X: DataFrame | None = None, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: """Fit to data. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. y: int, str, dict, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. @@ -846,19 +812,18 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return np.array(columns) - @composed(crash, method_to_log) def transform( self, X: DataFrame | None = None, - y: Tabular | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + y: Pandas | None = None, + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Apply the data cleaning steps to the data. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. y: int, str, dict, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. @@ -964,12 +929,11 @@ def transform( return variable_return(X, y) - @composed(crash, method_to_log) def inverse_transform( self, X: DataFrame | None = None, - y: Tabular | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + y: Pandas | None = None, + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Inversely transform the label encoding. This method only inversely transforms the target encoding. @@ -1172,8 +1136,7 @@ def __init__( self.sp = sp self.seasonal_model = seasonal_model - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -1241,8 +1204,7 @@ def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Decompose the data. 
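A hedged usage sketch for the Balancer changes above. The strategy value and extra parameters are assumptions (atom's Balancer wraps imbalanced-learn samplers, so imblearn must be installed); only the fit/transform surface shown in this hunk is taken as given:

```python
import pandas as pd
from sklearn.datasets import make_classification

from atom.data_cleaning import Balancer  # assumes atom-ml with imbalanced-learn

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=1)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])
y = pd.Series(y, name="target")

balancer = Balancer(strategy="adasyn", random_state=1)
X_bal, y_bal = balancer.fit(X, y).transform(X, y)

# Oversampled rows receive new indices (see the index handling above).
print(len(X), "->", len(X_bal))
print(y_bal.value_counts().to_dict())
```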
Parameters @@ -1266,8 +1228,7 @@ def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: return X - @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Inversely transform the data. Parameters @@ -1461,8 +1422,7 @@ def __init__( self.bins = bins self.labels = labels - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -1589,8 +1549,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Bin the data into intervals. Parameters @@ -1778,8 +1737,7 @@ def __init__( self.value = value self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Note that leaving y=None can lead to errors if the `strategy` @@ -1952,8 +1910,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return get_col_order(cols, self.feature_names_in_, self._estimator.feature_names_in_) - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Encode the data. Parameters @@ -2185,8 +2142,7 @@ def __init__( self.max_nan_rows = max_nan_rows self.max_nan_cols = max_nan_cols - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -2312,12 +2268,11 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> [c for c in self.feature_names_in_ if c in self._estimator.get_feature_names_out()] ) - @composed(crash, method_to_log) def transform( self, X: DataFrame, - y: Tabular | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + y: Pandas | None = None, + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Impute the missing values. Note that leaving y=None can lead to inconsistencies in @@ -2592,8 +2547,7 @@ def __init__( self.strategy = strategy self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -2652,8 +2606,7 @@ def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the transformations to the data. Parameters @@ -2677,8 +2630,7 @@ def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: return X[self.feature_names_in_] - @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the inverse transformation to the data. 
Parameters @@ -2867,12 +2819,11 @@ def __init__( self.include_target = include_target self.kwargs = kwargs - @composed(crash, method_to_log) def transform( self, X: DataFrame, - y: Tabular | None = None, - ) -> Tabular | tuple[DataFrame, Tabular]: + y: Pandas | None = None, + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Apply the outlier strategy on the data. Parameters @@ -3145,8 +3096,7 @@ def __init__( self.include_binary = include_binary self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -3193,8 +3143,7 @@ def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Perform standardization by centering and scaling. Parameters @@ -3218,8 +3167,7 @@ def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: return X - @composed(crash, method_to_log) - def inverse_transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the inverse transformation to the data. Parameters diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index fa5896d69..060afb4f4 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -33,10 +33,10 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( - Bool, DataFrame, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, + Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, - Series, Tabular, Verbose, + Pandas, Verbose, ) from atom.utils.utils import ( Goal, Task, check_is_fitted, check_scaling, composed, crash, @@ -172,8 +172,7 @@ def __init__( self.drop_columns = drop_columns self.from_index = from_index - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Extract the new features. Parameters @@ -237,7 +236,7 @@ def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: continue # Skip if the resulting feature has zero variance min_val: int = 0 - max_val: Scalar | Series | None = None # None if isn't cyclic + max_val: Scalar | pd.Series | None = None # None if isn't cyclic if self.encoding_type == "cyclic": if fx == "microsecond": min_val, max_val = 0, 1e6 - 1 @@ -419,8 +418,7 @@ def __init__( self.operators = operators self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. Parameters @@ -508,8 +506,7 @@ def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: return self - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Generate new features. 
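A short usage sketch for the Scaler as it appears in the hunks above; the `strategy` value is atom's documented default and the toy data is arbitrary:

```python
import pandas as pd

from atom.data_cleaning import Scaler  # assumes atom-ml is installed

X = pd.DataFrame({"age": [21, 35, 52, 44], "income": [20_000, 54_000, 61_000, 48_000]})

scaler = Scaler(strategy="standard")
X_scaled = scaler.fit(X).transform(X)
X_back = scaler.inverse_transform(X_scaled)  # round-trips to the original scale

print(X_scaled.mean().round(6).tolist())  # ~[0.0, 0.0] after standardization
```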
Parameters @@ -680,8 +677,7 @@ def __init__( self.operators = operators self.drop_columns = drop_columns - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Group features. Parameters @@ -1023,8 +1019,7 @@ def __init__( self.max_correlation = max_correlation self.kwargs = kwargs - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit the feature selector to the data. The univariate, sfm (when model is not fitted), sfs, rfe and @@ -1380,7 +1375,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) - self._estimator.fit(X, y) + self._estimator.fit(X, y) else: check_y() @@ -1477,8 +1472,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> ] ) - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Transform the data. Parameters diff --git a/atom/models/classreg.py b/atom/models/classreg.py index 87666cc16..df4129ef1 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -3081,7 +3081,7 @@ class XGBoost(BaseModel): } @property - def trials(self) -> pd.pd.DataFrame: + def trials(self) -> pd.DataFrame: """Overview of the trials' results. This property is only available for models that ran diff --git a/atom/nlp.py b/atom/nlp.py index 94dccdf39..103c548d9 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -28,7 +28,7 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, DataFrame, Engine, FloatLargerZero, Sequence, Tabular, + Bool, Engine, FloatLargerZero, Sequence, Pandas, VectorizerStarts, Verbose, bool_t, ) from atom.utils.utils import ( @@ -193,8 +193,7 @@ def __init__( self.regex_number = regex_number self.drop_punctuation = drop_punctuation - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Apply the transformations to the data. Parameters @@ -444,8 +443,7 @@ def __init__( self.stem = stem self.lemmatize = lemmatize - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Normalize the text. Parameters @@ -664,8 +662,7 @@ def __init__( self.trigram_freq = trigram_freq self.quadgram_freq = quadgram_freq - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Tokenize the text. Parameters @@ -923,8 +920,7 @@ def _get_corpus_columns(self) -> list[str]: "The get_feature_names_out method is not available for strategy='hashing'." ) - @composed(crash, method_to_log) - def fit(self, X: DataFrame, y: Tabular | None = None) -> Self: + def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """Fit to data. 
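The NLP transformers above all operate on the `corpus` column of a dataframe (a lone object column is renamed to `corpus` by `_check_input` earlier in this patch). A plain-pandas sketch of the kind of cleanup TextCleaner's options describe, lowercasing and stripping numbers and punctuation; it mirrors the documented behavior, not atom's implementation:

```python
import string

import pandas as pd

X = pd.DataFrame({"corpus": ["Hello WORLD!!", "ATOM cleans 123 texts."]})

corpus = (
    X["corpus"]
    .str.lower()                                                # lowercase the documents
    .str.replace(r"\d+", "", regex=True)                        # strip number tokens
    .str.translate(str.maketrans("", "", string.punctuation))   # drop punctuation
    .str.strip()
)
print(corpus.tolist())  # ['hello world', 'atom cleans  texts']
```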
Parameters @@ -994,8 +990,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> og_columns = [c for c in self.feature_names_in_ if c != self._corpus] return np.array(og_columns + self._get_corpus_columns()) - @composed(crash, method_to_log) - def transform(self, X: DataFrame, y: Tabular | None = None) -> DataFrame: + def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """Vectorize the text. Parameters diff --git a/atom/pipeline.py b/atom/pipeline.py index c5711c89c..7d0ac0b69 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -26,8 +26,8 @@ from typing_extensions import Self from atom.utils.types import ( - Bool, DataFrame, Estimator, FHConstructor, Float, Scalar, Sequence, - Tabular, Verbose, XConstructor, YConstructor, EngineDataOptions + Bool, Estimator, FHConstructor, Float, Scalar, Sequence, + Pandas, Verbose, XConstructor, YConstructor, EngineDataOptions ) from atom.utils.utils import ( NotFittedError, adjust_verbosity, check_is_fitted, fit_one, @@ -274,14 +274,14 @@ def _fit( X: XConstructor | None = None, y: YConstructor | None = None, routed_params: dict[str, Bunch] | None = None, - ) -> tuple[DataFrame | None, Tabular | None]: + ) -> tuple[pd.DataFrame | None, Pandas | None]: """Get data transformed through the pipeline. Parameters ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None if the pipeline only uses y. + `X` is ignored. None if the pipeline only uses y. y: dict, sequence, dataframe or None, default=None Target column(s) corresponding to `X`. @@ -429,7 +429,7 @@ def fit( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. y: dict, sequence, dataframe or None, default=None Target column(s) corresponding to `X`. @@ -466,7 +466,7 @@ def fit_transform( X: XConstructor | None = None, y: YConstructor | None = None, **params, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Fit the pipeline and transform the data. Call `fit` followed by `transform` on each transformer in the @@ -480,7 +480,7 @@ def fit_transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None + `X` is ignored. None if the estimator only uses y. y: dict, sequence, dataframe or None, default=None @@ -525,7 +525,7 @@ def transform( *, filter_train_only: Bool = True, **params, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Transform the data. Call `transform` on each transformer in the pipeline. The @@ -539,7 +539,7 @@ def transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None if the pipeline only uses y. + `X` is ignored. None if the pipeline only uses y. y: dict, sequence, dataframe or None, default=None Target column(s) corresponding to `X`. @@ -587,7 +587,7 @@ def inverse_transform( *, filter_train_only: Bool = True, **params, - ) -> Tabular | tuple[DataFrame, Tabular]: + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Inverse transform for each step in a reverse order. All estimators in the pipeline must implement the @@ -597,7 +597,7 @@ def inverse_transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. None if the pipeline only uses y. + `X` is ignored. 
None if the pipeline only uses y. y: dict, sequence, dataframe or None, default=None Target column(s) corresponding to `X`. @@ -684,7 +684,7 @@ def predict( X: XConstructor | None = None, fh: FHConstructor | None = None, **params, - ) -> np.ndarray | Tabular: + ) -> np.ndarray | Pandas: """Transform, then predict of the final estimator. Parameters @@ -737,7 +737,7 @@ def predict_interval( X: XConstructor | None = None, *, coverage: Float | Sequence[Float] = 0.9, - ) -> Tabular: + ) -> Pandas: """Transform, then predict_quantiles of the final estimator. Parameters @@ -862,7 +862,7 @@ def predict_quantiles( X: XConstructor | None = None, *, alpha: Float | Sequence[Float] = (0.05, 0.95), - ) -> Tabular: + ) -> Pandas: """Transform, then predict_quantiles of the final estimator. Parameters @@ -895,7 +895,7 @@ def predict_residuals( self, y: YConstructor, X: XConstructor | None = None, - ) -> Tabular: + ) -> Pandas: """Transform, then predict_residuals of the final estimator. Parameters @@ -954,7 +954,6 @@ def predict_var( return self.steps[-1][1].predict_var(fh=fh, X=X, cov=cov) - @composed(crash, method_to_log) def set_output(self, *, transform: EngineDataOptions | None = None): """Set output container. @@ -989,7 +988,6 @@ def set_output(self, *, transform: EngineDataOptions | None = None): if transform is None: return self - super().set_output(transform=transform) self.engine = getattr(self, "engine", EngineTuple()).data = transform return self diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index 215bb488d..2e87e07c4 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -30,8 +30,8 @@ from atom.plots.baseplot import BasePlot from atom.utils.constants import PALETTE from atom.utils.types import ( - Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, PACFMethods, - RowSelector, Segment, Sequence, Series, TargetSelector, + Bool, ColumnSelector, Int, IntLargerZero, Legend, PACFMethods, + RowSelector, Segment, Sequence, TargetSelector, ) from atom.utils.utils import ( check_dependency, crash, divide, get_corpus, has_task, lst, @@ -540,7 +540,7 @@ def plot_components( @crash def plot_correlation( self, - columns: Segment | Sequence[Int | str] | DataFrame | None = None, + columns: Segment | Sequence[Int | str] | pd.DataFrame | None = None, method: Literal["pearson", "kendall", "spearman"] = "pearson", *, title: str | dict[str, Any] | None = None, @@ -1862,7 +1862,7 @@ def plot_qq( @crash def plot_relationships( self, - columns: Segment | Sequence[Int | str] | DataFrame = (0, 1, 2), + columns: Segment | Sequence[Int | str] | pd.DataFrame = (0, 1, 2), *, title: str | dict[str, Any] | None = None, legend: Legend | dict[str, Any] | None = None, diff --git a/atom/utils/patches.py b/atom/utils/patches.py index ad592220c..ac770d9dc 100644 --- a/atom/utils/patches.py +++ b/atom/utils/patches.py @@ -36,24 +36,6 @@ # Functions ======================================================== >> -def wrap_method_output(f: Callable, method: str) -> Callable: - """Wrap sklearn's _wrap_method_output function. - - Custom implementation to avoid errors for transformers that allow - only providing `y`. Is used internally by _SetOutputMixin. - - """ - - @wraps(f) - def wrapper(self, *args, **kwargs): - try: - return _wrap_method_output(f, method)(self, *args, **kwargs) - except TypeError: - return f(self, *args, **kwargs) - - return wrapper - - def fit_and_score(*args, **kwargs) -> dict[str, Any]: """Wrap sklearn's _fit_and_score function. 
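The `set_output` hunks above (on the transformers and on atom's Pipeline) now store the requested container on the `engine` attribute instead of relying on sklearn's output wrapping. A hedged sketch of the call pattern; the step composition is arbitrary and the accepted strings are the keys of `DATA_ENGINES` shown earlier:

```python
from sklearn.datasets import load_iris

from atom.data_cleaning import Scaler  # assumes atom-ml is installed
from atom.pipeline import Pipeline

X, y = load_iris(return_X_y=True, as_frame=True)

pipe = Pipeline(steps=[("scaler", Scaler())])
pipe.fit(X, y)

pipe.set_output(transform="pandas")
print(type(pipe.transform(X)).__name__)  # DataFrame (plain pandas, the default engine)

pipe.set_output(transform="polars")      # requires the optional polars dependency
print(type(pipe.transform(X)).__name__)  # DataFrame again, but now a polars one
```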
diff --git a/atom/utils/types.py b/atom/utils/types.py index da79e1f08..5a44217b0 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -136,11 +136,6 @@ class SPTuple(NamedTuple): trend_model: SeasonalityModels = "additive" -@runtime_checkable -class DataFrame(Protocol): - def __dataframe__(self, *args, **kwargs): ... - - @runtime_checkable class SkScorer(Protocol): """Protocol for sklearn's scorers.""" @@ -195,7 +190,7 @@ class Model(Protocol): # _metric: ClassMap # _ht: dict[str, Any] - def predict(self, *args, **kwargs) -> Tabular: ... + def predict(self, *args, **kwargs) -> Pandas: ... # Variable types for type hinting ================================== >> @@ -206,9 +201,7 @@ def predict(self, *args, **kwargs) -> Tabular: ... Float: TypeAlias = float | np.floating Scalar: TypeAlias = Int | Float Segment: TypeAlias = slice | range -Series: TypeAlias = pd.Series | md.Series | pl.Series | pa.Array Pandas: TypeAlias = pd.Series | pd.DataFrame -Tabular: TypeAlias = Series | DataFrame # Numerical types IntLargerZero: TypeAlias = Annotated[Int, Is[lambda x: x > 0]] @@ -228,7 +221,7 @@ def predict(self, *args, **kwargs) -> Tabular: ... | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]] | np.ndarray | sps.spmatrix - | DataFrame + | pd.DataFrame ) XSelector: TypeAlias = XConstructor | Callable[..., XConstructor] YConstructor: TypeAlias = dict[str, Any] | Sequence[Any] | XConstructor @@ -236,11 +229,11 @@ def predict(self, *args, **kwargs) -> Tabular: ... FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon # Return types for transform methods -TReturn: TypeAlias = np.ndarray | sps.spmatrix | Sequence[Any] | DataFrame +TReturn: TypeAlias = np.ndarray | sps.spmatrix | Sequence[Any] | pd.DataFrame TReturns: TypeAlias = TReturn | tuple[TReturn, TReturn] # Selection of rows or columns by name or position -ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | DataFrame +ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | pd.DataFrame RowSelector: TypeAlias = Hashable | Sequence[Hashable] | ColumnSelector # Assignment of index or stratify parameter diff --git a/atom/utils/utils.py b/atom/utils/utils.py index a2163ec5c..f9c6be978 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -54,9 +54,9 @@ from atom.utils.constants import __version__ from atom.utils.types import ( - Bool, DataFrame, Estimator, FeatureNamesOut, Float, IndexSelector, Int, + Bool, Estimator, FeatureNamesOut, Float, IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Sequence, SPTuple, Tabular, Transformer, TReturn, + Scorer, Segment, Sequence, SPTuple, Pandas, Transformer, TReturn, TReturns, Verbose, XConstructor, XSelector, YConstructor, YSelector, int_t, segment_t, sequence_t, ) @@ -92,7 +92,7 @@ class Goal(Enum): regression = 1 forecast = 2 - def infer_task(self, y: Tabular) -> Task: + def infer_task(self, y: Pandas) -> Task: """Infer the task corresponding to a target column. Parameters @@ -244,7 +244,7 @@ class DataConfig: """ - index: bool = True + index: bool = False ignore: tuple[str, ...] = () sp: SPTuple = SPTuple() # noqa: RUF009 shuffle: Bool = False @@ -253,7 +253,7 @@ class DataConfig: test_size: Scalar = 0.2 holdout_size: Scalar | None = None - def get_stratify_columns(self, df: DataFrame, y: Tabular) -> DataFrame | None: + def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: """Get columns to stratify by. 
Parameters @@ -1510,7 +1510,7 @@ def get_segment(obj: list[T], segment: Segment) -> list[T]: return obj[slice(segment.start, segment.stop, segment.step)] -def is_sparse(obj: Tabular) -> bool: +def is_sparse(obj: Pandas) -> bool: """Check if the dataframe is sparse. A data set is considered sparse if any of its columns is sparse. @@ -1529,13 +1529,13 @@ def is_sparse(obj: Tabular) -> bool: return any(isinstance(col.dtype, pd.SparseDtype) for col in get_cols(obj)) -def check_empty(obj: Tabular) -> Tabular | None: +def check_empty(obj: Pandas) -> Pandas | None: """Check if a pandas object is empty. Parameters ---------- obj: series or dataframe - Tabular object to check. + Pandas object to check. Returns ------- @@ -1632,7 +1632,7 @@ def check_predict_proba(models: Model | Sequence[Model], method: str): ) -def check_scaling(X: Tabular) -> bool: +def check_scaling(X: Pandas) -> bool: """Check if the data is scaled. A data set is considered scaled when the mean of the mean of @@ -1933,14 +1933,14 @@ def to_tabular( data: YConstructor, index: Axes | None = ..., columns: str | Axes | None = ..., -) -> Tabular: ... +) -> Pandas: ... def to_tabular( data: YConstructor | None, index: Axes | None = None, columns: str | Axes | None = None, -) -> Tabular | None: +) -> Pandas | None: """Convert to a tabular pandas type. If the data is one-dimensional, convert to series, else to a @@ -1954,13 +1954,13 @@ def to_tabular( index: sequence, index or None, default=None Values for the index. - columns: sequence or None, default=None + columns: str, sequence or None, default=None Name of the columns. Use None for automatic naming. Returns ------- series, dataframe or None - Data as a Tabular object. + Data as a Pandas object. """ if (n_targets := n_cols(data)) == 1: @@ -2299,7 +2299,7 @@ def fit_one( X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. y: dict, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. @@ -2374,7 +2374,7 @@ def transform_one( X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. y: dict, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. 
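`to_tabular`, used throughout the hunks above, returns a Series for one-dimensional data and a DataFrame otherwise. A simplified re-implementation of that rule for illustration only; atom's version also handles index alignment and column naming:

```python
import numpy as np
import pandas as pd


def to_tabular_sketch(data, index=None) -> pd.Series | pd.DataFrame:
    """Return a Series for 1d data, a DataFrame for 2d data."""
    arr = np.asarray(data)
    return pd.Series(arr, index=index) if arr.ndim == 1 else pd.DataFrame(arr, index=index)


print(type(to_tabular_sketch([1, 2, 3])).__name__)         # Series
print(type(to_tabular_sketch([[1, 2], [3, 4]])).__name__)  # DataFrame
```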
@@ -2431,16 +2431,8 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: else: return out - Xt = to_df( - data=X, - index=getattr(y, "index", None), - columns=getattr(transformer, "feature_names_in_", None), - ) - yt = to_tabular( - y, - index=getattr(Xt, "index", None), - columns=getattr(transformer, "target_names_in_", None), - ) + Xt = to_df(X, index=getattr(y, "index", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None)) use_y = True @@ -2474,12 +2466,11 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: X_new = prepare_df(out[0], Xt) y_new = to_tabular( data=out[1], - index=Xt.index, - columns=get_col_names(yt), + index=yt.index, ) if isinstance(yt, pd.DataFrame): y_new = prepare_df(y_new, yt) - elif "X" in params and X is not None and any(c in Xt for c in inc): + elif "X" in params and Xt is not None and any(c in Xt for c in inc): # X in -> X out X_new = prepare_df(out, Xt) y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0) @@ -2487,7 +2478,6 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: y_new = to_tabular( data=out, index=yt.index, - columns=get_col_names(yt), ) X_new = Xt if Xt is None else Xt.set_index(y_new.index) if isinstance(yt, pd.DataFrame): @@ -2514,7 +2504,7 @@ def fit_transform_one( X: dataframe-like or None Feature set with shape=(n_samples, n_features). If None, - X is ignored. + `X` is ignored. y: dict, sequence, dataframe-like or None Target column(s) corresponding to `X`. @@ -2694,7 +2684,7 @@ def wrapper( X: XSelector | None = None, y: YSelector | None = None, **kwargs, - ) -> T_Transformer | Tabular | tuple[DataFrame, Tabular]: + ) -> T_Transformer | Pandas | tuple[pd.DataFrame, Pandas]: if f.__name__ == "fit": Xt = to_df(X, index=getattr(y, "index", None)) yt = to_tabular(y, index=getattr(Xt, "index", None)) diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py index 6c56c1738..5ade3d5cc 100644 --- a/docs_sources/scripts/autodocs.py +++ b/docs_sources/scripts/autodocs.py @@ -627,7 +627,7 @@ def get_table(self, blocks: list) -> str: elif obj.__class__.__name__ == "cached_property": obj = obj.func - # Get the return type. Sometimes it returns a string 'Tabular' + # Get the return type. Sometimes it returns a string 'Pandas' # and sometimes a class pandas.DataFrame. Unclear why output = str(signature(obj).return_annotation) @@ -927,7 +927,7 @@ def types_conversion(dtype: str) -> str: "Pipeline": "[Pipeline][]", "collections.abc.Hashable": "str", "Scalar": "int | float", - "Tabular": "Series | DataFrame", + "Pandas": "Series | pd.DataFrame", "int | numpy.integer": "int", "float | numpy.floating": "float", "Series | modin.pandas.series.Series": "Series", diff --git a/docs_sources/user_guide/accelerating.md b/docs_sources/user_guide/accelerating.md index 0892be9ac..1860f1dc9 100644 --- a/docs_sources/user_guide/accelerating.md +++ b/docs_sources/user_guide/accelerating.md @@ -71,15 +71,15 @@ parameter, e.g. `#!python engine="pyarrow"` or `#!python engine={"data": "pyarro ATOM integrates the following data engines: - **numpy**: Transform the data to a [`numpy`](https://numpy.org/) array. -- **pandas**: Transform the data to [`pandas`](https://pandas.pydata.org/docs/index.html) with `numpy` backend. This - is the default engine and, in almost all cases, leaves the data unchanged. +- **pandas**: Leave the dataset as a [`pandas`](https://pandas.pydata.org/docs/index.html) object. This is the default + engine, that leaves the data unchanged. 
- **pandas-pyarrow**: Transform the data to [`pandas`](https://pandas.pydata.org/docs/user_guide/pyarrow.html) with the [`pyarrow`](https://arrow.apache.org/docs/python/index.html) backend. Read more in pandas' [user guide](https://pandas.pydata.org/docs/user_guide/pyarrow.html). - **polars**: The [polars](https://docs.pola.rs/) library is a blazingly fast dataframe library implemented in Rust and based on Apache Arrow. Transforms the data to a polars dataframe or series. - **polars-lazy**: This engine is similar to the `polars` engine, but it returns - a [pl.LazyFrame](https://docs.pola.rs/py-polars/html/reference/lazyframe/index.html) instead of a [pl.DataFrame](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html). + a [pl.LazyFrame](https://docs.pola.rs/py-polars/html/reference/lazyframe/index.html) instead of a [pl.DataFrame](https://docs.pola.rs/py-polars/html/reference/dataframe/index.html). - **pyarrow**: PyArrow is a cross-language, platform-independent, in-memory data format, that provides an efficient and fast way to serialize and deserialize data. the data is transformed to a [pa.Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) or [pa.Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html). diff --git a/tests/conftest.py b/tests/conftest.py index 8dd9b3e5b..673194cc9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,8 @@ import numpy as np import pandas as pd +from ray.util.joblib import register_ray +import pyarrow as pa import pytest from sklearn.base import BaseEstimator from sklearn.datasets import ( @@ -31,7 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import DataFrame, Sequence, Tabular, XSelector + from atom.utils.types import DataFrame, Sequence, Pandas, XSelector class DummyTransformer(TransformerMixin, BaseEstimator): @@ -113,10 +115,22 @@ def random(): return np.random.default_rng() +@pytest.fixture() +def ray(): + """Register ray as joblib backend. + + Although atom does this internally, it's skipped when ray is + mocked. Not registering it fails the call to joblib.parallel_config + in basetransformer.py. + + """ + register_ray() + + def get_train_test( X: XSelector | None, - y: Sequence[Any] | DataFrame, -) -> Tabular | tuple[Tabular, Tabular]: + y: Sequence[Any] | pd.DataFrame, +) -> Pandas | tuple[Pandas, Pandas]: """Get train and test sets from X and y. 
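To make the data-engine selection described above concrete, a minimal sketch of both accepted forms of the `engine` argument documented in accelerating.md (the breast-cancer loader is used purely for illustration):

```python
from atom import ATOMClassifier
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# A plain string or a dict with the "data" key selects the data engine;
# "pandas" is the default and leaves the returned objects unchanged.
atom = ATOMClassifier(X, y, engine="pandas-pyarrow", random_state=1)
atom = ATOMClassifier(X, y, engine={"data": "polars"}, random_state=1)
```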
Parameters @@ -139,6 +153,7 @@ def get_train_test( if X is not None: return train_test_split( merge(to_df(X), to_tabular(y, columns=[f"y{i}" for i in range(n_cols(y))])), + shuffle=False, test_size=0.3, random_state=1, ) @@ -154,6 +169,9 @@ def get_train_test( X_class, y_class = load_wine(return_X_y=True, as_frame=True) X_reg, y_reg = load_diabetes(return_X_y=True, as_frame=True) +# Pyarrow dtypes +X_pa = X_bin.astype(pd.ArrowDtype(pa.float64())) + # Multilabel classification data X_label, y_label = make_multilabel_classification(n_samples=200, n_classes=4) diff --git a/tests/test_atom.py b/tests/test_atom.py index b59e857b1..dc69b8be6 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -9,6 +9,7 @@ from unittest.mock import MagicMock, patch import numpy as np +import pyarrow as pa import pandas as pd import pytest from category_encoders.target_encoder import TargetEncoder @@ -34,7 +35,7 @@ X10, DummyTransformer, X10_dt, X10_nan, X10_str, X10_str2, X20_out, X_bin, X_class, X_ex, X_label, X_reg, X_sparse, X_text, y10, y10_label, y10_label2, y10_sn, y10_str, y_bin, y_class, y_ex, y_fc, y_label, - y_multiclass, y_multireg, y_reg, + y_multiclass, y_multireg, y_reg, X_pa ) @@ -442,8 +443,8 @@ def test_shrink_dense2sparse(): def test_shrink_pyarrow(): - """Assert that it works with the pyarrow data backend.""" - atom = ATOMClassifier(X_bin, y_bin, engine={"data": "pyarrow"}, random_state=1) + """Assert that it works with pyarrow dtypes.""" + atom = ATOMClassifier(X_pa, y_bin, engine="pandas-pyarrow", random_state=1) assert atom.dtypes[0].name == "double[pyarrow]" atom.shrink() assert atom.dtypes[0].name == "float[pyarrow]" @@ -630,15 +631,15 @@ def test_add_keep_column_names(): assert atom.features.tolist() == ["x0", "x1", "x2", "x3"] # Transformer keeps rows equal - atom.add(DummyTransformer(strategy="equal"), get_feature_names_out=None) + atom.add(DummyTransformer(strategy="equal"), feature_names_out=None) assert atom.features.tolist() == ["x0", "x1", "x2", "x3"] # Transformer drops rows - atom.add(DummyTransformer(strategy="drop"), get_feature_names_out=None) + atom.add(DummyTransformer(strategy="drop"), feature_names_out=None) assert atom.features.tolist() == ["x0", "x2", "x3"] # Transformer adds a new column - atom.add(DummyTransformer(strategy="add"), columns="!x2", get_feature_names_out=None) + atom.add(DummyTransformer(strategy="add"), columns="!x2", feature_names_out=None) assert atom.features.tolist() == ["x0", "x2", "x3", "x4"] @@ -649,9 +650,9 @@ def test_raise_length_mismatch(): atom.prune(columns=[2, 4]) -def test_add_pyarrow_columns(): +def test_keep_pyarrow_dtypes(): """Assert that columns keep the pyarrow dtype.""" - atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + atom = ATOMClassifier(X_pa, y_bin, random_state=1) assert isinstance(atom.dtypes[0], pd.ArrowDtype) atom.scale() assert isinstance(atom.dtypes[0], pd.ArrowDtype) diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 4cf8dd589..80ca1a240 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -319,174 +319,6 @@ def test_results_property_train_sizing(): assert list(atom.results.index.get_level_values(0)) == [0.2, 0.4, 0.6, 0.8, 1.0] -# Test _check_input ============================================== >> - -def test_input_is_copied(): - """Assert that the data is copied.""" - X, y = BaseTransformer._check_input(X_bin, y_bin) - assert X is not X_bin - assert y is not y_bin - - -def test_input_X_and_y_None(): - """Assert that an error is raised when both X and 
y are None.""" - with pytest.raises(ValueError, match=".*both None.*"): - BaseTransformer._check_input() - - -def test_X_is_callable(): - """Assert that the data provided can be a callable.""" - X, _ = BaseTransformer._check_input(lambda: [[1, 2], [2, 1], [3, 1]]) - assert isinstance(X, pd.DataFrame) - - -def test_to_pandas(): - """Assert that the data provided is converted to pandas objects.""" - X, y = BaseTransformer._check_input(X_bin_array, y_bin_array) - assert isinstance(X, pd.DataFrame) - assert isinstance(y, pd.Series) - - -def test_column_order_is_retained(): - """Assert that column order is kept if column names are specified.""" - X_shuffled = X_bin[sample(list(X_bin.columns), X_bin.shape[1])] - X, _ = BaseTransformer._check_input(X_shuffled, columns=X_bin.columns) - assert list(X.columns) == list(X_bin.columns) - - -def test_incorrect_columns(): - """Assert that an error is raised when the provided columns do not match.""" - with pytest.raises(ValueError, match=".*features are different.*"): - BaseTransformer._check_input(X_bin, columns=["1", "2"]) - - -def test_input_data_in_atom(): - """Assert that the data does not change once in an atom pipeline.""" - atom = ATOMClassifier(X10, y10, random_state=1) - X10[3][2] = 99 # Change an item of the original variable - assert 99 not in atom.dataset # Is unchanged in the pipeline - - -def test_input_data_in_training(): - """Assert that the data does not change once in a training pipeline.""" - train = bin_train.copy() - trainer = DirectClassifier("LR", random_state=1) - trainer.run(train, bin_test) - train.iloc[3, 2] = 99 # Change an item of the original variable - assert 99 not in trainer.dataset # Is unchanged in the pipeline - - -def test_text_to_corpus(): - """Assert that for text data the column is named corpus.""" - atom = ATOMClassifier(X_text, y10, random_state=1) - assert atom.X.columns == ["corpus"] - - -def test_int_columns_to_str(): - """Assert that int columns are converted to str.""" - X = X_bin.copy() - X.columns = range(X.shape[1]) - atom = ATOMClassifier(X, y_bin, random_state=1) - assert atom.X.columns[0] == "0" - - -def test_duplicate_column_names_in_X(): - """Assert that an error is raised when X has duplicate column names.""" - X = merge(X_bin.copy(), pd.Series(1, name="mean texture")) - with pytest.raises(ValueError, match=".*column names found in X.*"): - ATOMClassifier(X, y_bin, random_state=1) - - -def test_sparse_matrices_X_y(): - """Assert that sparse matrices are accepted as (X, y) input.""" - atom = ATOMClassifier(X_sparse, y10, random_state=1) - assert isinstance(atom.X, pd.DataFrame) - assert atom.shape == (10, 4) - assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" - - -def test_sparse_matrices_2_tuples(): - """Assert that sparse matrices are accepted as 2-tuples input.""" - atom = ATOMClassifier((X_sparse, y10), (X_sparse, y10), random_state=1) - assert isinstance(atom.X, pd.DataFrame) - assert atom.shape == (20, 4) - assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" - - -def test_target_is_dict(): - """Assert that the target column is assigned correctly for a dict.""" - _, y = BaseTransformer._check_input(X10, {"a": [0] * 10}) - assert isinstance(y, pd.Series) - - -def test_multioutput_str(): - """Assert that multioutput can be assigned by column name.""" - X, y = BaseTransformer._check_input(X_bin, ["mean radius", "worst perimeter"]) - assert list(y.columns) == ["mean radius", "worst perimeter"] - - -def test_multioutput_int(): - """Assert that multioutput can be assigned by 
column position.""" - X, y = BaseTransformer._check_input(X_bin, [0, 2]) - assert list(y.columns) == ["mean radius", "mean perimeter"] - - -def test_equal_length(): - """Assert that an error is raised when X and y have unequal length.""" - with pytest.raises(ValueError, match=".*number of rows.*"): - BaseTransformer._check_input(X10, [312, 22]) - - -def test_equal_index(): - """Assert that an error is raised when X and y don't have same indices.""" - y = pd.Series(y_bin_array, index=range(10, len(y_bin_array) + 10)) - with pytest.raises(ValueError, match=".*same indices.*"): - BaseTransformer._check_input(X_bin, y) - - -def test_target_is_string(): - """Assert that the target column is assigned correctly for a string.""" - _, y = BaseTransformer._check_input(X_bin, y="mean radius") - assert y.name == "mean radius" - - -def test_target_not_in_dataset(): - """Assert that the target column given by y is in X.""" - with pytest.raises(ValueError, match=".*not found in X.*"): - BaseTransformer._check_input(X_bin, "X") - - -def test_X_is_None_with_str(): - """Assert that an error is raised when X is None and y is a string.""" - with pytest.raises(ValueError, match=".*can't be None when y is a str.*"): - BaseTransformer._check_input(y="test") - - -def test_target_is_int(): - """Assert that target column is assigned correctly for an integer.""" - _, y = BaseTransformer._check_input(X_bin, y=0) - assert y.name == "mean radius" - - -def test_X_is_None_with_int(): - """Assert that an error is raised when X is None and y is an int.""" - with pytest.raises(ValueError, match=".*can't be None when y is an int.*"): - BaseTransformer._check_input(y=1) - - -def test_target_is_none(): - """Assert that target column stays None when empty input.""" - _, y = BaseTransformer._check_input(X_bin, y=None) - assert y is None - - -def test_X_empty_df(): - """Assert that X becomes an empty dataframe when provided but in y.""" - X, y = BaseTransformer._check_input(y_fc, y=-1) - assert X.empty - assert isinstance(y, pd.Series) - - # Test _get_data =================================================== >> def test_index_is_true(): @@ -761,7 +593,7 @@ def test_input_is_train_test_with_parameter_y(): def test_input_is_train_test_for_forecast(): """Assert that input train, test works for forecast tasks.""" - trainer = DirectForecaster("ES", random_state=1) + trainer = DirectForecaster("Croston", random_state=1) trainer.run(fc_train, fc_test) assert_series_equal(trainer.y, pd.concat([fc_train, fc_test])) diff --git a/tests/test_basetrainer.py b/tests/test_basetrainer.py index d0e702be6..f78e3c8e7 100644 --- a/tests/test_basetrainer.py +++ b/tests/test_basetrainer.py @@ -14,7 +14,6 @@ from optuna.pruners import MedianPruner from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import f1_score, make_scorer - from atom import ATOMClassifier from atom.training import DirectClassifier, DirectRegressor @@ -377,7 +376,7 @@ def test_errors_keep(): @patch("atom.basetransformer.ray", MagicMock()) @patch("atom.basetrainer.ray", MagicMock()) -def test_parallel_with_ray(): +def test_parallel_with_ray(ray): """Assert that parallel runs successfully with ray backend.""" trainer = DirectClassifier( models=["LR", "LDA"], @@ -386,7 +385,7 @@ def test_parallel_with_ray(): backend="ray", random_state=1, ) - # Fails because MagicMock returns empty list + # Fails because MagicMock returns an empty list with pytest.raises(RuntimeError, match=".*All models failed.*"): trainer.run(bin_train, bin_test) diff --git 
a/tests/test_basetransformer.py b/tests/test_basetransformer.py index cdddef32b..6f6095a20 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -177,6 +177,174 @@ def test_device_id_invalid(): BaseTransformer(device="gpu:2,3") +# Test _check_input ============================================== >> + +def test_input_is_copied(): + """Assert that the data is copied.""" + X, y = BaseTransformer._check_input(X_bin, y_bin) + assert X is not X_bin + assert y is not y_bin + + +def test_input_X_and_y_None(): + """Assert that an error is raised when both X and y are None.""" + with pytest.raises(ValueError, match=".*both None.*"): + BaseTransformer._check_input() + + +def test_X_is_callable(): + """Assert that the data provided can be a callable.""" + X, _ = BaseTransformer._check_input(lambda: [[1, 2], [2, 1], [3, 1]]) + assert isinstance(X, pd.DataFrame) + + +def test_to_pandas(): + """Assert that the data provided is converted to pandas objects.""" + X, y = BaseTransformer._check_input(X_bin_array, y_bin_array) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + + +def test_column_order_is_retained(): + """Assert that column order is kept if column names are specified.""" + X_shuffled = X_bin[sample(list(X_bin.columns), X_bin.shape[1])] + X, _ = BaseTransformer._check_input(X_shuffled, columns=X_bin.columns) + assert list(X.columns) == list(X_bin.columns) + + +def test_incorrect_columns(): + """Assert that an error is raised when the provided columns do not match.""" + with pytest.raises(ValueError, match=".*features are different.*"): + BaseTransformer._check_input(X_bin, columns=["1", "2"]) + + +def test_input_data_in_atom(): + """Assert that the data does not change once in an atom pipeline.""" + atom = ATOMClassifier(X10, y10, random_state=1) + X10[3][2] = 99 # Change an item of the original variable + assert 99 not in atom.dataset # Is unchanged in the pipeline + + +def test_input_data_in_training(): + """Assert that the data does not change once in a training pipeline.""" + train = bin_train.copy() + trainer = DirectClassifier("LR", random_state=1) + trainer.run(train, bin_test) + train.iloc[3, 2] = 99 # Change an item of the original variable + assert 99 not in trainer.dataset # Is unchanged in the pipeline + + +def test_text_to_corpus(): + """Assert that for text data the column is named corpus.""" + atom = ATOMClassifier(X_text, y10, random_state=1) + assert atom.X.columns == ["corpus"] + + +def test_int_columns_to_str(): + """Assert that int columns are converted to str.""" + X = X_bin.copy() + X.columns = range(X.shape[1]) + atom = ATOMClassifier(X, y_bin, random_state=1) + assert atom.X.columns[0] == "0" + + +def test_duplicate_column_names_in_X(): + """Assert that an error is raised when X has duplicate column names.""" + X = merge(X_bin.copy(), pd.Series(1, name="mean texture")) + with pytest.raises(ValueError, match=".*column names found in X.*"): + ATOMClassifier(X, y_bin, random_state=1) + + +def test_sparse_matrices_X_y(): + """Assert that sparse matrices are accepted as (X, y) input.""" + atom = ATOMClassifier(X_sparse, y10, random_state=1) + assert isinstance(atom.X, pd.DataFrame) + assert atom.shape == (10, 4) + assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" + + +def test_sparse_matrices_2_tuples(): + """Assert that sparse matrices are accepted as 2-tuples input.""" + atom = ATOMClassifier((X_sparse, y10), (X_sparse, y10), random_state=1) + assert isinstance(atom.X, pd.DataFrame) + assert atom.shape == (20, 4) + 
assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" + + +def test_target_is_dict(): + """Assert that the target column is assigned correctly for a dict.""" + _, y = BaseTransformer._check_input(X10, {"a": [0] * 10}) + assert isinstance(y, pd.Series) + + +def test_multioutput_str(): + """Assert that multioutput can be assigned by column name.""" + X, y = BaseTransformer._check_input(X_bin, ["mean radius", "worst perimeter"]) + assert list(y.columns) == ["mean radius", "worst perimeter"] + + +def test_multioutput_int(): + """Assert that multioutput can be assigned by column position.""" + X, y = BaseTransformer._check_input(X_bin, [0, 2]) + assert list(y.columns) == ["mean radius", "mean perimeter"] + + +def test_equal_length(): + """Assert that an error is raised when X and y have unequal length.""" + with pytest.raises(ValueError, match=".*number of rows.*"): + BaseTransformer._check_input(X10, [312, 22]) + + +def test_equal_index(): + """Assert that an error is raised when X and y don't have same indices.""" + y = pd.Series(y_bin_array, index=range(10, len(y_bin_array) + 10)) + with pytest.raises(ValueError, match=".*same indices.*"): + BaseTransformer._check_input(X_bin, y) + + +def test_target_is_string(): + """Assert that the target column is assigned correctly for a string.""" + _, y = BaseTransformer._check_input(X_bin, y="mean radius") + assert y.name == "mean radius" + + +def test_target_not_in_dataset(): + """Assert that the target column given by y is in X.""" + with pytest.raises(ValueError, match=".*not found in X.*"): + BaseTransformer._check_input(X_bin, "X") + + +def test_X_is_None_with_str(): + """Assert that an error is raised when X is None and y is a string.""" + with pytest.raises(ValueError, match=".*can't be None when y is a str.*"): + BaseTransformer._check_input(y="test") + + +def test_target_is_int(): + """Assert that target column is assigned correctly for an integer.""" + _, y = BaseTransformer._check_input(X_bin, y=0) + assert y.name == "mean radius" + + +def test_X_is_None_with_int(): + """Assert that an error is raised when X is None and y is an int.""" + with pytest.raises(ValueError, match=".*can't be None when y is an int.*"): + BaseTransformer._check_input(y=1) + + +def test_target_is_none(): + """Assert that target column stays None when empty input.""" + _, y = BaseTransformer._check_input(X_bin, y=None) + assert y is None + + +def test_X_empty_df(): + """Assert that X becomes an empty dataframe when provided but in y.""" + X, y = BaseTransformer._check_input(y_fc, y=-1) + assert X.empty + assert isinstance(y, pd.Series) + + # Test _inherit ==================================================== >> def test_inherit(): From 9562c4925dce8da3d1cb27a8fbf71216a74aa941 Mon Sep 17 00:00:00 2001 From: Mavs Date: Thu, 15 Feb 2024 21:26:28 +0100 Subject: [PATCH 04/12] dataengines 5 --- atom/api.py | 26 +- atom/atom.py | 38 +- atom/basemodel.py | 34 +- atom/baserunner.py | 9 +- atom/basetransformer.py | 103 ++--- atom/branch/branch.py | 10 +- atom/branch/dataengines.py | 5 +- atom/data_cleaning.py | 815 ++++++++++++++++------------------ atom/feature_engineering.py | 246 +++++----- atom/models/classreg.py | 1 + atom/nlp.py | 176 ++++---- atom/pipeline.py | 7 +- atom/plots/dataplot.py | 6 +- atom/utils/patches.py | 2 - atom/utils/types.py | 9 +- atom/utils/utils.py | 145 ++---- tests/conftest.py | 20 +- tests/test_atom.py | 5 +- tests/test_baserunner.py | 12 +- tests/test_basetrainer.py | 3 +- tests/test_basetransformer.py | 8 +- tests/test_branch.py | 15 +- 
tests/test_nlp.py | 16 - 23 files changed, 778 insertions(+), 933 deletions(-) diff --git a/atom/api.py b/atom/api.py index e7a55e98e..31dc391c1 100644 --- a/atom/api.py +++ b/atom/api.py @@ -158,22 +158,20 @@ class ATOMClassifier(ATOM): **X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features). - **y: int, str, dict, sequence or dataframe**
+ **y: int, str, sequence or dataframe-like**
Target column(s) corresponding to `X`. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. - y: int, str, dict, sequence or dataframe, default=-1 + y: int, str, sequence or dataframe-like, default=-1 Target column(s) corresponding to `X`. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. @@ -438,26 +436,24 @@ class ATOMForecaster(ATOM): Exogenous feature set corresponding to y, with shape=(n_samples, n_features). - **y: int, str, dict, sequence or dataframe**
+ **y: int, str, sequence or dataframe-like**
Time series. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. - y: int, str, dict, sequence or dataframe, default=-1 + y: int, str, sequence or dataframe-like, default=-1 Time series. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. This parameter is ignored if the time series is provided through `arrays`. @@ -709,26 +705,24 @@ class ATOMRegressor(ATOM): **X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features). - **y: int, str, dict, sequence or dataframe**
+ **y: int, str, sequence or dataframe-like**
Target column(s) corresponding to `X`. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - If dataframe: Target columns for multioutput tasks. - y: int, str, dict, sequence or dataframe, default=-1 + y: int, str, sequence or dataframe-like, default=-1 Target column(s) corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. This parameter is ignored if the target column is provided through `arrays`. diff --git a/atom/atom.py b/atom/atom.py index 5eac1fad2..56d8557df 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -24,9 +24,10 @@ from beartype import beartype from joblib.memory import Memory from pandas._typing import DtypeObj +from polars.dependencies import _lazy_import from sklearn.pipeline import Pipeline as SkPipeline from sklearn.utils.metaestimators import available_if -from polars.dependencies import _lazy_import + from atom.baserunner import BaseRunner from atom.basetransformer import BaseTransformer from atom.branch import Branch, BranchManager @@ -47,15 +48,15 @@ ) from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING, __version__ from atom.utils.types import ( - Backend, Bins, Bool, CategoricalStrats, ColumnSelector, - DiscretizerStrats, Engine, EngineTuple, Estimator, FeatureNamesOut, - FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, - FloatLargerZero, FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, - IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, - NJobs, NormalizerStrats, NumericalStrats, Operators, Predictor, + Backend, Bins, Bool, CategoricalStrats, ColumnSelector, DiscretizerStrats, + Engine, EngineTuple, Estimator, FeatureNamesOut, FeatureSelectionSolvers, + FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, + FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, + IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, + NormalizerStrats, NumericalStrats, Operators, Pandas, Predictor, PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, - SPDict, Pandas, TargetSelector, Transformer, VectorizerStarts, Verbose, - Warnings, XSelector, YSelector, sequence_t, + SPDict, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, + XSelector, YSelector, sequence_t, ) from atom.utils.utils import ( ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, @@ -683,20 +684,18 @@ def inverse_transform( Parameters ---------- - X: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, `X` is ignored in the transformers. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Transformed target column corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. 
- If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -711,10 +710,10 @@ def inverse_transform( Original target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) + Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) with adjust_verbosity(self.pipeline, verbose) as pipeline: - return self._convert(pipeline.inverse_transform(X, y)) + return self._convert(pipeline.inverse_transform(Xt, yt)) @classmethod def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM: @@ -1110,16 +1109,15 @@ def transform( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -1134,10 +1132,10 @@ def transform( Transformed target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target) + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) with adjust_verbosity(self.pipeline, verbose) as pipeline: - return self._convert(pipeline.transform(X, y)) + return self._convert(pipeline.transform(Xt, yt)) # Base transformers ============================================ >> diff --git a/atom/basemodel.py b/atom/basemodel.py index 327041c63..a655e6709 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -66,18 +66,18 @@ from atom.utils.constants import DF_ATTRS from atom.utils.patches import fit_and_score from atom.utils.types import ( - HT, Backend, Bool, Engine, FHConstructor, Float, - FloatZeroToOneExc, Int, IntLargerEqualZero, MetricConstructor, - MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS, - Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, Pandas, - TargetSelector, Verbose, Warnings, XSelector, YSelector, float_t, int_t, + HT, Backend, Bool, Engine, FHConstructor, Float, FloatZeroToOneExc, Int, + IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas, + PredictionMethods, PredictionMethodsTS, Predictor, RowSelector, Scalar, + Scorer, Sequence, Stages, TargetSelector, Verbose, Warnings, XSelector, + YSelector, float_t, int_t, ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, TrialsCallback, adjust_verbosity, cache, check_dependency, check_empty, - composed, crash, estimator_has_attr, flt, get_cols, get_custom_scorer, - has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df, - to_series, to_tabular, get_col_names + composed, crash, estimator_has_attr, flt, get_col_names, get_cols, + get_custom_scorer, has_task, it, lst, merge, 
method_to_log, rnd, sign, + time_to_str, to_df, to_series, to_tabular, ) @@ -2254,16 +2254,15 @@ def inverse_transform( Transformed feature set with shape=(n_samples, n_features). If None, `X` is ignored in the transformers. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -2278,10 +2277,10 @@ def inverse_transform( Original target column. Only returned if provided. """ - X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) + Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.inverse_transform(X, y) + return pipeline.inverse_transform(Xt, yt) @composed(crash, method_to_log, beartype) def register( @@ -2453,16 +2452,15 @@ def transform( `X` is ignored. If None, `X` is ignored in the transformers. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -2477,10 +2475,10 @@ def transform( Transformed target column. Only returned if provided. 
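A minimal sketch of the `transform`/`inverse_transform` call pattern documented above, shown at the atom level (the model-level methods behave analogously; `atom.scale()` only serves to put a fitted transformer in the pipeline):

```python
from atom import ATOMClassifier
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

atom = ATOMClassifier(X, y, random_state=1)
atom.scale()  # add a fitted transformer to the pipeline

# transform runs data through the fitted pipeline; inverse_transform reverts
# it. `verbose` temporarily overrides the transformers' verbosity.
X_scaled = atom.transform(X, verbose=0)
X_original = atom.inverse_transform(X_scaled, verbose=0)
```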
""" - X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target) + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.transform(X, y) + return pipeline.transform(Xt, yt) class ClassRegModel: diff --git a/atom/baserunner.py b/atom/baserunner.py index 01073859d..f83eed87b 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -15,7 +15,7 @@ from copy import deepcopy from functools import cached_property from pathlib import Path -from typing import Any, overload, Literal +from typing import Any import dill as pickle import numpy as np @@ -39,14 +39,13 @@ from atom.utils.types import ( Bool, FloatZeroToOneExc, HarmonicsSelector, IndexSelector, Int, IntLargerOne, MetricConstructor, Model, ModelSelector, ModelsSelector, - RowSelector, Seasonality, Segment, Sequence, SPDict, SPTuple, - Pandas, TargetSelector, XSelector, YSelector, bool_t, int_t, pandas_t, - segment_t, sequence_t, + Pandas, RowSelector, Seasonality, Segment, Sequence, SPDict, SPTuple, + TargetSelector, YSelector, bool_t, int_t, pandas_t, segment_t, sequence_t, ) from atom.utils.utils import ( ClassMap, DataContainer, Goal, SeasonalPeriod, Task, check_is_fitted, composed, crash, divide, flt, get_cols, get_segment, get_versions, - has_task, lst, merge, method_to_log, n_cols, to_df, to_tabular, + has_task, lst, merge, method_to_log, n_cols, ) diff --git a/atom/basetransformer.py b/atom/basetransformer.py index 067100c01..b66e0a203 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -12,6 +12,7 @@ import re import tempfile import warnings +from copy import deepcopy from datetime import datetime as dt from importlib import import_module from importlib.util import find_spec @@ -19,22 +20,24 @@ from multiprocessing import cpu_count from pathlib import Path from typing import Any, TypeVar, overload -from copy import deepcopy + +import joblib import numpy as np import pandas as pd import requests -from polars.dependencies import _lazy_import from beartype import beartype -import joblib from joblib.memory import Memory +from polars.dependencies import _lazy_import from sklearn.utils.validation import check_memory from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, - EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Severity, - Verbose, Warnings, bool_t, Sequence, int_t + EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Sequence, + Severity, Verbose, Warnings, bool_t, int_t, +) +from atom.utils.utils import ( + check_dependency, crash, lst, make_sklearn, to_df, to_tabular, ) -from atom.utils.utils import check_dependency, crash, lst, make_sklearn, to_df, to_tabular mlflow, _ = _lazy_import("mlflow") @@ -399,8 +402,8 @@ def _check_input( ) -> tuple[pd.DataFrame | None, Pandas | None]: """Prepare the input data. - Convert X and y to pandas (if not already) and perform standard - compatibility checks (dimensions, length, indices, etc...). + Convert X and y to pandas and perform standard compatibility + checks (dimensions, length, indices, etc...). Parameters ---------- @@ -408,17 +411,16 @@ def _check_input( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. 
- If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. - - If dataframe: Target columns for multioutput tasks. + - If dataframe-like: Target columns for multioutput tasks. columns: sequence of str or None, default=None Column names for the feature set. If None, default names @@ -430,65 +432,46 @@ def _check_input( Returns ------- - dataframe or None - Feature dataset. Only returned if provided. + pd.DataFrame or None + Feature set. - series, dataframe or None + pd.Series, pd.DataFrame or None Target column(s) corresponding to `X`. """ - Xt: pd.DataFrame | None = None - yt: Pandas | None = None - if X is None and y is None: raise ValueError("X and y can't be both None!") - elif X is not None: + else: Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) - # If text dataset, change the name of the column to corpus - if list(Xt.columns) == ["x0"] and Xt[Xt.columns[0]].dtype == "object": - Xt = Xt.rename(columns={Xt.columns[0]: "corpus"}) - else: - # Convert all column names to str - Xt.columns = Xt.columns.astype(str) - - # No duplicate rows nor column names are allowed - if Xt.columns.duplicated().any(): - raise ValueError("Duplicate column names found in X.") - # Prepare target column if not isinstance(y, Int | str | None): - if isinstance(y, dict): - yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None), columns=name) + # If X and y have different number of rows, try multioutput + if Xt is not None and not isinstance(y, dict) and len(Xt) != len(y): + try: + targets: list[Hashable] = [] + for col in y: + if col in Xt.columns: + targets.append(col) + elif isinstance(col, int_t): + if -Xt.shape[1] <= col < Xt.shape[1]: + targets.append(Xt.columns[int(col)]) + else: + raise IndexError( + "Invalid value for the y parameter. Value " + f"{col} is out of range for data with " + f"{Xt.shape[1]} columns." + ) + + Xt, yt = Xt.drop(columns=targets), Xt[targets] + + except (TypeError, IndexError, KeyError): + raise ValueError( + "X and y don't have the same number of rows," + f" got len(X)={len(Xt)} and len(y)={len(y)}." + ) from None else: - # If X and y have different number of rows, try multioutput - if Xt is not None and len(Xt) != len(y): - try: - targets: list[Hashable] = [] - for col in y: - if col in Xt.columns: - targets.append(col) - elif isinstance(col, int_t): - if -Xt.shape[1] <= col < Xt.shape[1]: - targets.append(Xt.columns[int(col)]) - else: - raise IndexError( - "Invalid value for the y parameter. Value " - f"{col} is out of range for data with " - f"{Xt.shape[1]} columns." - ) - - Xt, yt = Xt.drop(columns=targets), Xt[targets] - - except (TypeError, IndexError, KeyError): - raise ValueError( - "X and y don't have the same number of rows," - f" got len(X)={len(Xt)} and len(y)={len(y)}." 
- ) from None - else: - yt = y - - yt = to_tabular(deepcopy(yt), index=getattr(Xt, "index", None), columns=name) + yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None), columns=name) # Check X and y have the same indices if Xt is not None and not Xt.index.equals(yt.index): @@ -509,6 +492,8 @@ def _check_input( raise ValueError("X can't be None when y is an int.") Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] + else: + yt = y return Xt, yt diff --git a/atom/branch/branch.py b/atom/branch/branch.py index 3a501e841..f37828a97 100644 --- a/atom/branch/branch.py +++ b/atom/branch/branch.py @@ -13,19 +13,19 @@ from pathlib import Path from typing import Literal, overload from warnings import filterwarnings -from polars.dependencies import _lazy_import import pandas as pd from beartype import beartype from beartype.roar import BeartypeDecorHintPep585DeprecationWarning from joblib.memory import Memory +from polars.dependencies import _lazy_import from sklearn.utils.validation import check_memory from atom.pipeline import Pipeline from atom.utils.types import ( - Bool, ColumnSelector, Int, IntLargerEqualZero, Pandas, - RowSelector, Scalar, Sequence, TargetSelector, TargetsSelector, - XConstructor, XSelector, YSelector, int_t, segment_t, + Bool, ColumnSelector, Int, IntLargerEqualZero, Pandas, RowSelector, Scalar, + Sequence, TargetSelector, TargetsSelector, XConstructor, XSelector, + YSelector, int_t, segment_t, ) from atom.utils.utils import ( DataContainer, check_scaling, flt, get_col_names, get_cols, lst, merge, @@ -33,7 +33,7 @@ ) -pickle = _lazy_import("dill") +pickle, _ = _lazy_import("dill") filterwarnings("ignore", category=BeartypeDecorHintPep585DeprecationWarning) diff --git a/atom/branch/dataengines.py b/atom/branch/dataengines.py index cebe51f93..a72a558a9 100644 --- a/atom/branch/dataengines.py +++ b/atom/branch/dataengines.py @@ -7,18 +7,17 @@ from __future__ import annotations +import os from abc import ABCMeta, abstractmethod -from polars.dependencies import _lazy_import import numpy as np import pandas as pd import polars as pl +from polars.dependencies import _lazy_import from atom.utils.types import Any, Pandas, Sequence from atom.utils.utils import get_cols -import os - os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 8e061ecb5..a7da4778a 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -11,28 +11,12 @@ from collections import defaultdict from collections.abc import Hashable from typing import Any, Literal, TypeVar -from unittest.mock import patch import numpy as np import pandas as pd import sklearn from beartype import beartype -from category_encoders import ( - BackwardDifferenceEncoder, BaseNEncoder, BinaryEncoder, CatBoostEncoder, - HelmertEncoder, JamesSteinEncoder, MEstimateEncoder, OneHotEncoder, - OrdinalEncoder, PolynomialEncoder, SumEncoder, TargetEncoder, WOEEncoder, -) -from imblearn.combine import SMOTEENN, SMOTETomek -from imblearn.over_sampling import ( - ADASYN, SMOTE, SMOTEN, SMOTENC, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE, - RandomOverSampler, -) -from imblearn.under_sampling import ( - AllKNN, CondensedNearestNeighbour, EditedNearestNeighbours, - InstanceHardnessThreshold, NearMiss, NeighbourhoodCleaningRule, - OneSidedSelection, RandomUnderSampler, RepeatedEditedNearestNeighbours, - TomekLinks, -) +from polars.dependencies import _lazy_import from scipy.stats import zscore from sklearn.base import ( BaseEstimator, OneToOneFeatureMixin, 
_clone_parametrized, @@ -40,30 +24,31 @@ from sklearn.compose import ColumnTransformer from sklearn.experimental import enable_iterative_imputer # noqa: F401 from sklearn.impute import IterativeImputer, KNNImputer -from sklearn.utils._set_output import _SetOutputMixin from sklearn.utils.validation import _check_feature_names_in -from sktime.transformations.series.detrend import ( - ConditionalDeseasonalizer, Deseasonalizer, Detrender, -) -from sktime.transformations.series.impute import Imputer as sktimeImputer from typing_extensions import Self from atom.basetransformer import BaseTransformer from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING from atom.utils.types import ( Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, - EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, - IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, - Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, - Pandas, Transformer, Verbose, XConstructor, YConstructor, sequence_t, EngineDataOptions + EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int, + IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, + NumericalStrats, Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, + SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, + YConstructor, sequence_t, ) from atom.utils.utils import ( - Goal, check_is_fitted, composed, crash, get_col_order, get_cols, it, lst, - make_sklearn, merge, method_to_log, n_cols, replace_missing, sign, to_df, - to_series, variable_return, wrap_transformer_methods, to_tabular + Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst, + make_sklearn, merge, n_cols, replace_missing, sign, to_df, to_series, + to_tabular, variable_return, ) +category_encoders, _ = _lazy_import("category_encoders") +imblearn, _ = _lazy_import("imblearn") +sktime, _ = _lazy_import("sktime") + + T_Transformer = TypeVar("T_Transformer", bound=Transformer) @@ -81,11 +66,6 @@ class TransformerMixin(BaseEstimator, BaseTransformer): """ - def __init_subclass__(cls, **kwargs): - """Wrap transformer methods to apply data and fit check.""" - for k in ("fit", "transform", "inverse_transform"): - setattr(cls, k, wrap_transformer_methods(getattr(cls, k))) - def __repr__(self, N_CHAR_MAX: Int = 700) -> str: """Drop named tuples if default parameters from string representation.""" out = super().__repr__(N_CHAR_MAX) @@ -111,7 +91,7 @@ def __sklearn_clone__(self: T_Transformer) -> T_Transformer: return cloned - def fit(self, X, y, **fit_params) -> Self: + def fit(self, X=None, y=None, **fit_params) -> Self: """Do nothing. Implemented for continuity of the API. @@ -122,7 +102,7 @@ def fit(self, X, y, **fit_params) -> Self: Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. If None, `y` is ignored. @@ -135,11 +115,21 @@ def fit(self, X, y, **fit_params) -> Self: Estimator instance. """ + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self._log(f"Fitting {self.__class__.__name__}...", 1) return self - def fit_transform(self, X, y, **fit_params) -> Pandas | tuple[pd.DataFrame, Pandas]: + def fit_transform( + self, + X: XConstructor | None = None, + y: YConstructor | None = None, + **fit_params, + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Fit to data, then transform it. 
Parameters @@ -148,7 +138,7 @@ def fit_transform(self, X, y, **fit_params) -> Pandas | tuple[pd.DataFrame, Pand Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. If None, `y` is ignored. @@ -166,7 +156,12 @@ def fit_transform(self, X, y, **fit_params) -> Pandas | tuple[pd.DataFrame, Pand """ return self.fit(X, y, **fit_params).transform(X, y) - def inverse_transform(self, X, y) -> Pandas | tuple[pd.DataFrame, Pandas]: + def inverse_transform( + self, + X: XConstructor | None = None, + y: YConstructor | None = None, + **fit_params, + ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Do nothing. Returns the input unchanged. Implemented for continuity of the @@ -178,7 +173,7 @@ def inverse_transform(self, X, y) -> Pandas | tuple[pd.DataFrame, Pandas]: Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. If None, `y` is ignored. @@ -191,7 +186,12 @@ def inverse_transform(self, X, y) -> Pandas | tuple[pd.DataFrame, Pandas]: Target column(s). Only returned if provided. """ - return variable_return(self._convert(X), self._convert(y)) + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + yt = to_tabular(y, index=Xt.index, columns=getattr(y, "target_names_in_", None)) + + return variable_return(self._convert(Xt), self._convert(yt)) def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: """Set output container. @@ -232,7 +232,7 @@ def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: @beartype -class Balancer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Balancer(TransformerMixin, OneToOneFeatureMixin): """Balance the number of samples per class in the target column. When oversampling, the newly created samples have an increasing @@ -375,7 +375,7 @@ def fit(self, X: XConstructor, y: YConstructor) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: dict or sequence + y: sequence Target column corresponding to `X`. Returns @@ -384,8 +384,9 @@ def fit(self, X: XConstructor, y: YConstructor) -> Self: Estimator instance. 
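A hedged usage sketch for the Balancer class described above; "smote" is one of the keys in the strategies mapping shown further below, and the toy data and remaining default arguments are assumptions:

```python
import pandas as pd

from atom.data_cleaning import Balancer

# Toy imbalanced data: 80 negative vs. 20 positive samples.
X = pd.DataFrame({"feat1": range(100), "feat2": range(100, 200)})
y = pd.Series([0] * 80 + [1] * 20, name="target")

# "smote" resolves through the strategies mapping to imblearn's SMOTE.
balancer = Balancer(strategy="smote", random_state=1)
X_bal, y_bal = balancer.fit_transform(X, y)
print(y_bal.value_counts())  # classes are now (roughly) balanced
```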
""" - Xt = to_df(X, index=getattr(y, "index", None)) - yt = to_tabular(y, index=getattr(Xt, "index", None)) + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) + self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) @@ -395,27 +396,27 @@ def fit(self, X: XConstructor, y: YConstructor) -> Self: raise ValueError("The Balancer class does not support multioutput tasks.") strategies = { - # clustercentroids=ClusterCentroids, # noqa: ERA001 (has no sample_indices_) - "condensednearestneighbour": CondensedNearestNeighbour, - "editednearestneighborus": EditedNearestNeighbours, - "repeatededitednearestneighbours": RepeatedEditedNearestNeighbours, - "allknn": AllKNN, - "instancehardnessthreshold": InstanceHardnessThreshold, - "nearmiss": NearMiss, - "neighbourhoodcleaningrule": NeighbourhoodCleaningRule, - "onesidedselection": OneSidedSelection, - "randomundersampler": RandomUnderSampler, - "tomeklinks": TomekLinks, - "randomoversampler": RandomOverSampler, - "smote": SMOTE, - "smotenc": SMOTENC, - "smoten": SMOTEN, - "adasyn": ADASYN, - "borderlinesmote": BorderlineSMOTE, - "kmeanssmote": KMeansSMOTE, - "svmsmote": SVMSMOTE, - "smoteenn": SMOTEENN, - "smotetomek": SMOTETomek, + # clustercentroids=imblearn.under_sampling.ClusterCentroids, # noqa: ERA001 (has no sample_indices_) + "condensednearestneighbour": imblearn.under_sampling.CondensedNearestNeighbour, + "editednearestneighborus": imblearn.under_sampling.EditedNearestNeighbours, + "repeatededitednearestneighbours": imblearn.under_sampling.RepeatedEditedNearestNeighbours, + "allknn": imblearn.under_sampling.AllKNN, + "instancehardnessthreshold": imblearn.under_sampling.InstanceHardnessThreshold, + "nearmiss": imblearn.under_sampling.NearMiss, + "neighbourhoodcleaningrule": imblearn.under_sampling.NeighbourhoodCleaningRule, + "onesidedselection": imblearn.under_sampling.OneSidedSelection, + "randomundersampler": imblearn.under_sampling.RandomUnderSampler, + "tomeklinks": imblearn.under_sampling.TomekLinks, + "randomoversampler": imblearn.over_sampling.RandomOverSampler, + "smote": imblearn.over_sampling.SMOTE, + "smotenc": imblearn.over_sampling.SMOTENC, + "smoten": imblearn.over_sampling.SMOTEN, + "adasyn": imblearn.over_sampling.ADASYN, + "borderlinesmote": imblearn.over_sampling.BorderlineSMOTE, + "kmeanssmote": imblearn.over_sampling.KMeansSMOTE, + "svmsmote": imblearn.over_sampling.SVMSMOTE, + "smoteenn": imblearn.combine.SMOTEENN, + "smotetomek": imblearn.combine.SMOTETomek, } if isinstance(self.strategy, str): @@ -459,7 +460,7 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. X: dataframe-like Feature set with shape=(n_samples, n_features). - y: dict or sequence + y: sequence Target column corresponding to `X`. Returns @@ -473,16 +474,8 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. """ check_is_fitted(self) - Xt = to_df( - data=X, - index=getattr(y, "index", None), - columns=getattr(self, "feature_names_in_", None), - ) - yt = to_tabular( - y, - index=getattr(Xt, "index", None), - columns=getattr(self, "target_names_in_", None), - ) + Xt = to_df(X, columns=self.feature_names_in_) + yt = to_tabular(y, index=Xt.index, columns=self.target_names_in_) if "over_sampling" in self._estimator.__module__: self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1) @@ -565,7 +558,7 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. 
@beartype -class Cleaner(TransformerMixin, _SetOutputMixin): +class Cleaner(TransformerMixin): """Applies standard data cleaning steps on a dataset. Use the parameters to choose which transformations to perform. @@ -717,7 +710,7 @@ def __init__( self.drop_missing_target = drop_missing_target self.encode_target = encode_target - def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -726,24 +719,21 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - Returns ------- Self Estimator instance. """ + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self.mapping_: dict[str, Any] = {} self._drop_cols = [] self._estimators = {} @@ -753,26 +743,23 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self: self._log("Fitting Cleaner...", 1) - if X is not None and self.drop_dtypes is not None: - self._drop_cols = list(X.select_dtypes(include=lst(self.drop_dtypes)).columns) + if Xt is not None and self.drop_dtypes is not None: + self._drop_cols = list(Xt.select_dtypes(include=lst(self.drop_dtypes)).columns) - if y is not None: - if isinstance(y, pd.Series): - self.target_names_in_ = np.array([y.name]) - else: - self.target_names_in_ = y.columns.to_numpy() + if yt is not None: + self.target_names_in_ = np.array(get_col_names(yt)) if self.drop_chars: - if isinstance(y, pd.Series): - y.name = re.sub(self.drop_chars, "", str(y.name)) + if isinstance(yt, pd.Series): + yt.name = re.sub(self.drop_chars, "", str(yt.name)) else: - y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) if self.drop_missing_target: - y = replace_missing(y, self.missing_).dropna(axis=0) + yt = replace_missing(yt, self.missing_).dropna(axis=0) if self.encode_target: - for col in get_cols(y): + for col in get_cols(yt): if isinstance(col.iloc[0], sequence_t): # Multilabel MultiLabelBinarizer = self._get_est_class( name="MultiLabelBinarizer", @@ -814,8 +801,8 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> def transform( self, - X: DataFrame | None = None, - y: Pandas | None = None, + X: XConstructor | None = None, + y: YConstructor | None = None, ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Apply the data cleaning steps to the data. @@ -825,18 +812,9 @@ def transform( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. 
- - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - Returns ------- dataframe @@ -846,93 +824,102 @@ def transform( Transformed target column. Only returned if provided. """ + check_is_fitted(self) + + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + yt = to_tabular( + data=y, + index=getattr(Xt, "index", None), + columns=getattr(self, "target_names_in_", None), + ) + self._log("Cleaning the data...", 1) - if X is not None: + if Xt is not None: # Unify all missing values - X = replace_missing(X, self.missing_) + Xt = replace_missing(Xt, self.missing_) - for name, column in X.items(): + for name, column in Xt.items(): # Drop features with an invalid data type if name in self._drop_cols: self._log( f" --> Dropping feature {name} for " f"having type: {column.dtype.name}.", 2, ) - X = X.drop(columns=name) + Xt = Xt.drop(columns=name) elif column.dtype.name in CAT_TYPES: if self.strip_categorical: # Strip strings from blank spaces - X[name] = column.apply( + Xt[name] = column.apply( lambda val: val.strip() if isinstance(val, str) else val ) # Drop prohibited chars from column names if self.drop_chars: - X = X.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x))) + Xt = Xt.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x))) # Drop duplicate samples if self.drop_duplicates: - X = X.drop_duplicates(ignore_index=True) + Xt = Xt.drop_duplicates(ignore_index=True) if self.convert_dtypes: - X = X.convert_dtypes() + Xt = Xt.convert_dtypes() - if y is not None: + if yt is not None: if self.drop_chars: if isinstance(y, pd.Series): - y.name = re.sub(self.drop_chars, "", str(y.name)) + yt.name = re.sub(self.drop_chars, "", str(yt.name)) else: - y = y.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) # Delete samples with missing values in target if self.drop_missing_target: - length = len(y) # Save original length to count deleted rows later - y = replace_missing(y, self.missing_).dropna() + length = len(yt) # Save original length to count deleted rows later + yt = replace_missing(yt, self.missing_).dropna() - if X is not None: - X = X[X.index.isin(y.index)] # Select only indices that remain + if Xt is not None: + Xt = Xt[Xt.index.isin(yt.index)] # Select only indices that remain - if (d := length - len(y)) > 0: + if (d := length - len(yt)) > 0: self._log(f" --> Dropping {d} rows with missing values in target.", 2) if self.encode_target and self._estimators: - yt = y.__class__(dtype="object") - for col in get_cols(y): + y_new = yt.__class__(dtype="object") + for col in get_cols(yt): if est := self._estimators.get(col.name): if n_cols(out := est.transform(col)) == 1: self._log(f" --> Label-encoding column {col.name}.", 2) - out = to_series(out, y.index, col.name) + out = to_series(out, yt.index, col.name) else: self._log(f" --> Label-binarizing column {col.name}.", 2) out = to_df( data=out, - index=y.index, + index=yt.index, columns=[f"{col.name}_{c}" for c in est.classes_], ) # Replace target with encoded column(s) - if isinstance(y, pd.Series): - yt = out + if isinstance(yt, pd.Series): + y_new = out else: - yt = merge(yt, out) + y_new = merge(y_new, out) else: # Add unchanged column - yt = merge(yt, col) + y_new = merge(y_new, col) - y = yt + yt = y_new if self.convert_dtypes: - y = 
y.convert_dtypes() + yt = yt.convert_dtypes() - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) def inverse_transform( self, - X: DataFrame | None = None, - y: Pandas | None = None, + X: XConstructor | None = None, + y: YConstructor | None = None, ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Inversely transform the label encoding. @@ -945,18 +932,9 @@ def inverse_transform( X: dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - Returns ------- dataframe @@ -966,38 +944,43 @@ def inverse_transform( Original target column. Only returned if provided. """ + check_is_fitted(self) + + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + self._log("Inversely cleaning the data...", 1) - if y is not None and self._estimators: - yt = y.__class__(dtype="object") + if yt is not None and self._estimators: + y_new = yt.__class__(dtype="object") for col in self.target_names_in_: if est := self._estimators.get(col): if est.__class__.__name__ == "LabelEncoder": self._log(f" --> Inversely label-encoding column {col}.", 2) - out = est.inverse_transform(pd.DataFrame(y)[col]) + out = est.inverse_transform(pd.DataFrame(yt)[col]) - elif isinstance(y, pd.DataFrame): + elif isinstance(yt, pd.DataFrame): self._log(f" --> Inversely label-binarizing column {col}.", 2) out = est.inverse_transform( - y.loc[:, y.columns.str.startswith(f"{col}_")].to_numpy() + yt.loc[:, yt.columns.str.startswith(f"{col}_")].to_numpy() ) # Replace encoded columns with target column - if isinstance(y, pd.Series): - yt = to_series(out, y.index, col) + if isinstance(yt, pd.Series): + y_new = to_series(out, yt.index, col) else: - yt = merge(yt, to_series(out, y.index, col)) + y_new = merge(y_new, to_series(out, yt.index, col)) else: # Add unchanged column - yt = merge(yt, pd.DataFrame(y)[col]) + y_new = merge(y_new, pd.DataFrame(yt)[col]) - y = yt + yt = y_new - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @beartype -class Decomposer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Decomposer(TransformerMixin, OneToOneFeatureMixin): """Detrend and deseasonalize the time series. This class does two things: @@ -1136,7 +1119,7 @@ def __init__( self.sp = sp self.seasonal_model = seasonal_model - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -1144,7 +1127,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
Returns @@ -1155,6 +1138,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: """ from atom.models import MODELS + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if isinstance(self.model, str): if self.model in MODELS: model = MODELS[self.model]( @@ -1183,19 +1171,19 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._log("Fitting Decomposer...", 1) self._estimators: dict[Hashable, tuple[Transformer, Transformer]] = {} - for name, column in X.select_dtypes(include="number").items(): - trend = Detrender( + for name, column in Xt.select_dtypes(include="number").items(): + trend = sktime.transformations.series.detrend.Detrender( forecaster=forecaster, model=self.trend_model, ).fit(column) if self.test_seasonality: - season = ConditionalDeseasonalizer( + season = sktime.transformations.series.detrend.ConditionalDeseasonalizer( sp=self.sp or 1, model=self.seasonal_model, ).fit(trend.transform(column)) else: - season = Deseasonalizer( + season = sktime.transformations.series.detrend.Deseasonalizer( sp=self.sp or 1, model=self.seasonal_model, ).fit(trend.transform(column)) @@ -1204,7 +1192,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Decompose the data. Parameters @@ -1212,7 +1200,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1221,14 +1209,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Decomposing the data...", 1) for col, (trend, season) in self._estimators.items(): - X[col] = season.transform(trend.transform(X[col])) + Xt[col] = season.transform(trend.transform(Xt[col])) - return X + return self._convert(Xt) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Inversely transform the data. Parameters @@ -1236,7 +1228,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1245,16 +1237,20 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Original feature set. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Inversely decomposing the data...", 1) for col, (trend, season) in self._estimators.items(): - X[col] = trend.inverse_transform(season.inverse_transform(X[col])) + Xt[col] = trend.inverse_transform(season.inverse_transform(Xt[col])) - return X + return self._convert(Xt) @beartype -class Discretizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Discretizer(TransformerMixin, OneToOneFeatureMixin): """Bin continuous data into intervals. 
For each feature, the bin edges are computed during fit and, @@ -1312,24 +1308,12 @@ class Discretizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -1422,7 +1406,7 @@ def __init__( self.bins = bins self.labels = labels - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -1430,7 +1414,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1478,12 +1462,17 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: return labels + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self._estimators: dict[str, Estimator] = {} self._labels: dict[str, Sequence[str]] = {} self._log("Fitting Discretizer...", 1) - for i, col in enumerate(X.select_dtypes(include="number")): + for i, col in enumerate(Xt.select_dtypes(include="number")): # Assign bins per column if isinstance(self.bins, dict): if col in self.bins: @@ -1519,7 +1508,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: encode="ordinal", strategy=self.strategy, **kwargs, - ).fit(X[[col]]) + ).fit(Xt[[col]]) # Save labels for transform method self._labels[col] = get_labels( @@ -1545,11 +1534,11 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: self._estimators[col] = FunctionTransformer( func=pd.cut, kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)}, - ).fit(X[[col]]) + ).fit(Xt[[col]]) return self - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Bin the data into intervals. Parameters @@ -1557,7 +1546,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1566,25 +1555,29 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
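For the "custom" strategy above, the fitted FunctionTransformer simply forwards to pd.cut with the precomputed edges and labels. A minimal standalone sketch of that call (column name, edges and labels are made up for illustration):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"age": [12, 25, 48, 67, 33, 81]})

    edges = [-np.inf, 18, 40, 65, np.inf]      # illustrative bin edges
    labels = ("<18", "18-40", "40-65", ">65")  # one label per interval

    # pd.cut assigns every value to the interval it falls in and returns
    # a Categorical carrying the provided labels.
    X["age"] = pd.cut(X["age"], bins=edges, labels=labels)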
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Binning the features...", 1) for col in self._estimators: if self.strategy == "custom": - X[col] = self._estimators[col].transform(X[col]) + Xt[col] = self._estimators[col].transform(Xt[col]) else: - X[col] = self._estimators[col].transform(X[[col]]).iloc[:, 0] + Xt[col] = self._estimators[col].transform(Xt[[col]]).iloc[:, 0] # Replace cluster values with labels for i, label in enumerate(self._labels[col]): - X[col] = X[col].replace(i, label) + Xt[col] = Xt[col].replace(i, label) - self._log(f" --> Discretizing feature {col} in {X[col].nunique()} bins.", 2) + self._log(f" --> Discretizing feature {col} in {Xt[col].nunique()} bins.", 2) - return X + return self._convert(Xt) @beartype -class Encoder(TransformerMixin, _SetOutputMixin): +class Encoder(TransformerMixin): """Perform encoding of categorical features. The encoding type depends on the number of classes in the column: @@ -1737,7 +1730,7 @@ def __init__( self.value = value self.kwargs = kwargs - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Note that leaving y=None can lead to errors if the `strategy` @@ -1749,18 +1742,9 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence or dataframe-like + y: sequence or dataframe-like Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. 
- Returns ------- Self @@ -1772,20 +1756,26 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._categories = {} strategies = { - "backwarddifference": BackwardDifferenceEncoder, - "basen": BaseNEncoder, - "binary": BinaryEncoder, - "catboost": CatBoostEncoder, - "helmert": HelmertEncoder, - "jamesstein": JamesSteinEncoder, - "mestimate": MEstimateEncoder, - "ordinal": OrdinalEncoder, - "polynomial": PolynomialEncoder, - "sum": SumEncoder, - "target": TargetEncoder, - "woe": WOEEncoder, + "backwarddifference": category_encoders.BackwardDifferenceEncoder, + "basen": category_encoders.BaseNEncoder, + "binary": category_encoders.BinaryEncoder, + "catboost": category_encoders.CatBoostEncoder, + "helmert": category_encoders.HelmertEncoder, + "jamesstein": category_encoders.JamesSteinEncoder, + "mestimate": category_encoders.MEstimateEncoder, + "ordinal": category_encoders.OrdinalEncoder, + "polynomial": category_encoders.PolynomialEncoder, + "sum": category_encoders.SumEncoder, + "target": category_encoders.TargetEncoder, + "woe": category_encoders.WOEEncoder, } + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if isinstance(self.strategy, str): if self.strategy.lower().endswith("encoder"): self.strategy = self.strategy[:-7] # Remove 'Encoder' at the end @@ -1818,12 +1808,12 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: encoders: dict[str, list[str]] = defaultdict(list) - for name, column in X.select_dtypes(include=CAT_TYPES).items(): + for name, column in Xt.select_dtypes(include=CAT_TYPES).items(): # Replace infrequent classes with the string in `value` if self.infrequent_to_value: values = column.value_counts() self._to_value[name] = values[values <= infrequent_to_value].index.tolist() - X[name] = column.replace(self._to_value[name], self.value) + Xt[name] = column.replace(self._to_value[name], self.value) # Get the unique categories before fitting self._categories[name] = column.dropna().sort_values().unique().tolist() @@ -1837,8 +1827,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._log( f" --> The number of classes passed to feature {name} in the " f"ordinal parameter ({len(ordinal_c)}) don't match the number " - f"of classes in the data ({column.nunique(dropna=True)}).", - 1, + f"of classes in the data ({column.nunique(dropna=True)}).", 1, severity="warning", ) @@ -1883,7 +1872,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: remainder="passthrough", n_jobs=self.n_jobs, verbose_feature_names_out=False, - ).fit(X, y) + ).fit(Xt, yt) return self @@ -1910,7 +1899,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return get_col_order(cols, self.feature_names_in_, self._estimator.feature_names_in_) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Encode the data. Parameters @@ -1918,7 +1907,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1927,10 +1916,14 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Encoded dataframe. 
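The strategy mapping above now references the fully qualified category_encoders classes, presumably so the heavy import can be deferred. A small sketch of the same lookup combined with the infrequent_to_value grouping, using made-up data and a hypothetical "infrequent" placeholder value:

    import category_encoders as ce
    import pandas as pd

    X = pd.DataFrame({"city": ["A", "A", "B", "B", "C", "D"]})
    y = pd.Series([1, 0, 1, 1, 0, 1])

    # Replace classes that occur at most once with a shared placeholder,
    # mirroring the infrequent_to_value/value logic in fit and transform.
    counts = X["city"].value_counts()
    infrequent = counts[counts <= 1].index.tolist()
    X["city"] = X["city"].replace(infrequent, "infrequent")

    # Resolve the lowercase strategy name to its estimator and encode.
    strategies = {"target": ce.TargetEncoder}
    encoder = strategies["target"](cols=["city"])
    X_enc = encoder.fit_transform(X, y)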
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Encoding categorical columns...", 1) # Convert infrequent classes to value - X = X.replace(self._to_value, self.value) + Xt = Xt.replace(self._to_value, self.value) for name, categories in self._categories.items(): if name in self._estimator.transformers_[0][2]: @@ -1942,24 +1935,24 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log( f" --> {estimator.__class__.__name__[:-7]}-encoding feature " - f"{name}. Contains {X[name].nunique()} classes.", 2, + f"{name}. Contains {Xt[name].nunique()} classes.", 2, ) # Count the propagated missing values - if n_nans := X[name].isna().sum(): + if n_nans := Xt[name].isna().sum(): self._log(f" --> Propagating {n_nans} missing values.", 2) # Check for unknown classes - if uc := len(X[name].dropna()[~X[name].isin(categories)]): + if uc := len(Xt[name].dropna()[~Xt[name].isin(categories)]): self._log(f" --> Handling {uc} unknown classes.", 2) - Xt = self._estimator.transform(X) + Xt = self._estimator.transform(Xt) - return Xt[self.get_feature_names_out()] + return self._convert(Xt[self.get_feature_names_out()]) @beartype -class Imputer(TransformerMixin, _SetOutputMixin): +class Imputer(TransformerMixin): """Handle missing values in the data. Impute or remove missing values according to the selected strategy. @@ -2021,24 +2014,12 @@ class Imputer(TransformerMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -2142,7 +2123,7 @@ def __init__( self.max_nan_rows = max_nan_rows self.max_nan_cols = max_nan_cols - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -2150,7 +2131,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -2159,22 +2140,27 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Estimator instance. 
""" + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if not hasattr(self, "missing_"): self.missing_ = DEFAULT_MISSING self._log("Fitting Imputer...", 1) # Unify all values to impute - X = replace_missing(X, self.missing_) + Xt = replace_missing(Xt, self.missing_) if self.max_nan_rows is not None: if self.max_nan_rows <= 1: - self._max_nan_rows = int(X.shape[1] * self.max_nan_rows) + self._max_nan_rows = int(Xt.shape[1] * self.max_nan_rows) else: self._max_nan_rows = int(self.max_nan_rows) - X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows) - if X.empty: + Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows) + if Xt.empty: raise ValueError( "Invalid value for the max_nan_rows parameter, got " f"{self.max_nan_rows}. All rows contain more than " @@ -2184,11 +2170,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: if self.max_nan_cols is not None: if self.max_nan_cols <= 1: - max_nan_cols = int(X.shape[0] * self.max_nan_cols) + max_nan_cols = int(Xt.shape[0] * self.max_nan_cols) else: max_nan_cols = int(self.max_nan_cols) - X = X.drop(columns=X.columns[X.isna().sum() > max_nan_cols]) + Xt = Xt.drop(columns=Xt.columns[Xt.isna().sum() > max_nan_cols]) # Load the imputer class from sklearn or cuml (note the different modules) SimpleImputer = self._get_est_class( @@ -2208,7 +2194,8 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: elif self.strat_num == "drop": num_imputer = "passthrough" else: - num_imputer = make_sklearn(sktimeImputer)( + sktimeImputer = make_sklearn(sktime.transformations.series.impute.Imputer) + num_imputer = sktimeImputer( method=self.strat_num, missing_values=[pd.NA], random_state=self.random_state, @@ -2236,13 +2223,13 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._estimator = ColumnTransformer( transformers=[ - ("num_imputer", num_imputer, list(X.select_dtypes(include="number"))), - ("cat_imputer", cat_imputer, list(X.select_dtypes(include=CAT_TYPES))), + ("num_imputer", num_imputer, list(Xt.select_dtypes(include="number"))), + ("cat_imputer", cat_imputer, list(Xt.select_dtypes(include=CAT_TYPES))), ], remainder="passthrough", n_jobs=self.n_jobs, verbose_feature_names_out=False, - ).fit(X) + ).fit(Xt) return self @@ -2270,8 +2257,8 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> def transform( self, - X: DataFrame, - y: Pandas | None = None, + X: XConstructor, + y: YConstructor | None = None, ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Impute the missing values. @@ -2284,18 +2271,9 @@ def transform( X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - Returns ------- dataframe @@ -2305,6 +2283,11 @@ def transform( Transformed target column. Only returned if provided. 
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + yt = to_tabular(y, index=Xt.index) + num_imputer = self._estimator.named_transformers_["num_imputer"] cat_imputer = self._estimator.named_transformers_["cat_imputer"] @@ -2313,52 +2296,49 @@ def transform( self._log("Imputing missing values...", 1) # Unify all values to impute - X = replace_missing(X, self.missing_) + Xt = replace_missing(Xt, self.missing_) # Drop rows with too many missing values if self.max_nan_rows is not None: - length = len(X) - X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows) - if diff := length - len(X): + length = len(Xt) + Xt = Xt.dropna(axis=0, thresh=Xt.shape[1] - self._max_nan_rows) + if diff := length - len(Xt): self._log( f" --> Dropping {diff} samples for containing more " - f"than {self._max_nan_rows} missing values.", - 2, + f"than {self._max_nan_rows} missing values.", 2, ) if self.strat_num == "drop": - length = len(X) - X = X.dropna(subset=self._estimator.transformers_[0][2]) - if diff := length - len(X): + length = len(Xt) + Xt = Xt.dropna(subset=self._estimator.transformers_[0][2]) + if diff := length - len(Xt): self._log( f" --> Dropping {diff} samples for containing " - f"missing values in numerical columns.", - 2, + f"missing values in numerical columns.", 2, ) if self.strat_cat == "drop": - length = len(X) - X = X.dropna(subset=self._estimator.transformers_[1][2]) - if diff := length - len(X): + length = len(Xt) + Xt = Xt.dropna(subset=self._estimator.transformers_[1][2]) + if diff := length - len(Xt): self._log( f" --> Dropping {diff} samples for containing " - f"missing values in categorical columns.", - 2, + f"missing values in categorical columns.", 2, ) # Print imputation information per feature - for name, column in X.items(): + for name, column in Xt.items(): if nans := column.isna().sum(): # Drop columns with too many missing values if name not in self._estimator.feature_names_in_: self._log( f" --> Dropping feature {name}. 
Contains {nans} " - f"({nans * 100 // len(X)}%) missing values.", 2, + f"({nans * 100 // len(Xt)}%) missing values.", 2, ) - X = X.drop(columns=name) + Xt = Xt.drop(columns=name) continue - if self.strat_num != "drop" and name in num_imputer.feature_names_in_: + if name in getattr(num_imputer, "feature_names_in_", []): if not isinstance(self.strat_num, str): self._log( f" --> Imputing {nans} missing values with " @@ -2372,15 +2352,14 @@ def transform( elif self.strat_num in ("mean", "median", "most_frequent"): self._log( f" --> Imputing {nans} missing values with {self.strat_num} " - f"({np.round(get_stat(num_imputer, name), 2)}) in column " - f"{name}.", 2, + f"({np.round(get_stat(num_imputer, name), 2)}) in column {name}.", 2, ) else: self._log( f" --> Imputing {nans} missing values with {self.strat_num} " f"in column {name}.", 2, ) - elif self.strat_cat != "drop" and name in cat_imputer.feature_names_in_: + elif name in getattr(cat_imputer, "feature_names_in_", []): if self.strat_cat == "most_frequent": self._log( f" --> Imputing {nans} missing values with most_frequent " @@ -2392,20 +2371,20 @@ def transform( f"'{self.strat_cat}' in column {name}.", 2, ) - Xt = self._estimator.transform(X) + Xt = self._estimator.transform(Xt) # Make y consistent with X - if y is not None: - y = y[y.index.isin(Xt.index)] + if yt is not None: + yt = yt[yt.index.isin(Xt.index)] # Reorder columns to original order Xt = Xt[self.get_feature_names_out()] - return variable_return(Xt, y) + return variable_return(self._convert(Xt), self._convert(yt)) @beartype -class Normalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Normalizer(TransformerMixin, OneToOneFeatureMixin): """Transform the data to follow a Normal/Gaussian distribution. This transformation is useful for modeling issues related to @@ -2442,24 +2421,12 @@ class Normalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -2547,7 +2514,7 @@ def __init__( self.strategy = strategy self.kwargs = kwargs - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -2555,7 +2522,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
Returns @@ -2570,6 +2537,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "quantile": "QuantileTransformer", } + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + if self.strategy in ("yeojohnson", "boxcox"): estimator = self._get_est_class(strategies[self.strategy], "preprocessing") self._estimator = estimator( @@ -2590,7 +2562,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: f"Choose from: {', '.join(strategies)}." ) - num_cols = X.select_dtypes(include="number") + num_cols = Xt.select_dtypes(include="number") if num_cols.empty: raise ValueError( @@ -2606,7 +2578,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: return self - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Apply the transformations to the data. Parameters @@ -2614,7 +2586,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -2623,14 +2595,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Normalized dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Normalizing features...", 1) - Xt = self._estimator.transform(X[self._estimator.feature_names_in_]) - X.update(Xt) + Xt.update(self._estimator.transform(Xt[self._estimator.feature_names_in_])) - return X[self.feature_names_in_] + return self._convert(Xt) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Apply the inverse transformation to the data. Parameters @@ -2638,7 +2613,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -2647,17 +2622,25 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Original dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Inversely normalizing features...", 1) - Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_]) - Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_) - X.update(Xt) + Xt.update( + to_df( + data=self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]), + index=Xt.index, + columns=self._estimator.feature_names_in_, + ) + ) - return X + return self._convert(Xt) @beartype -class Pruner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Pruner(TransformerMixin, OneToOneFeatureMixin): """Prune outliers from the data. Replace or remove outliers. The definition of outlier depends @@ -2712,25 +2695,12 @@ class Pruner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. 
The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "sklearn" (default) - - "sklearnex" - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -2821,8 +2791,8 @@ def __init__( def transform( self, - X: DataFrame, - y: Pandas | None = None, + X: XConstructor, + y: YConstructor | None = None, ) -> Pandas | tuple[pd.DataFrame, Pandas]: """Apply the outlier strategy on the data. @@ -2831,18 +2801,9 @@ def transform( X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, dict, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe: Target columns for multioutput tasks. - Returns ------- dataframe @@ -2852,6 +2813,9 @@ def transform( Transformed target column. Only returned if provided. """ + Xt = to_df(X) + yt = to_series(y, index=Xt.index) + # Estimators with their modules strategies = { "iforest": ["IsolationForest", "ensemble"], @@ -2890,7 +2854,7 @@ def transform( self._log("Pruning outliers...", 1) # Prepare dataset (merge with y and exclude categorical columns) - objective = merge(X, y) if self.include_target and y is not None else X + objective = merge(Xt, yt) if self.include_target and yt is not None else Xt objective = objective.select_dtypes(include=["number"]) outliers = [] @@ -2956,23 +2920,23 @@ def transform( self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2) # Keep only the non-outliers from the data - X = X[mask] - if y is not None: - y = y[mask] + Xt = Xt[mask] + if yt is not None: + yt = yt[mask] else: # Replace the columns in X and y with the new values from objective - X.update(objective) - if isinstance(y, pd.Series) and y.name in objective: - y.update(objective[str(y.name)]) - elif isinstance(y, pd.DataFrame): - y.update(objective) + Xt.update(objective) + if isinstance(yt, pd.Series) and yt.name in objective: + yt.update(objective[str(yt.name)]) + elif isinstance(yt, pd.DataFrame): + yt.update(objective) - return variable_return(X, y) + return variable_return(self._convert(Xt), self._convert(yt)) @beartype -class Scaler(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Scaler(TransformerMixin, OneToOneFeatureMixin): """Scale the data. Apply one of sklearn's scaling strategies. Categorical columns @@ -3001,24 +2965,12 @@ class Scaler(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. 
The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -3096,7 +3048,7 @@ def __init__( self.include_binary = include_binary self.kwargs = kwargs - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -3104,7 +3056,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -3120,10 +3072,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "robust": "RobustScaler", } - num_cols = X.select_dtypes(include="number") + Xt = to_df(X) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + + num_cols = Xt.select_dtypes(include="number") if not self.include_binary: - num_cols = X[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]] + num_cols = Xt[[n for n, c in num_cols.items() if ~np.isin(c.unique(), [0, 1]).all()]] if num_cols.empty: raise ValueError( @@ -3132,18 +3089,17 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: "non-binary columns when include_binary=False." ) - estimator = self._get_est_class(strategies[self.strategy], "preprocessing") - self._estimator = estimator(**self.kwargs) - self._log("Fitting Scaler...", 1) - self._estimator.fit(num_cols) + + estimator = self._get_est_class(strategies[self.strategy], "preprocessing") + self._estimator = estimator(**self.kwargs).fit(num_cols) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) return self - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Perform standardization by centering and scaling. Parameters @@ -3151,7 +3107,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -3160,14 +3116,17 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Scaled dataframe. 
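Like Normalizer above, Scaler fits only on numeric, non-binary columns and writes the scaled values back with DataFrame.update, leaving all other columns untouched. A standalone sketch with StandardScaler and made-up data:

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    X = pd.DataFrame({
        "age": [25.0, 40.0, 31.0, 58.0],
        "income": [30_000.0, 52_000.0, 41_000.0, 75_000.0],
        "is_member": [0, 1, 1, 0],      # binary, skipped when include_binary=False
        "city": ["A", "B", "A", "C"],   # non-numeric, never scaled
    })

    num_cols = X.select_dtypes(include="number")
    num_cols = num_cols[
        [n for n, c in num_cols.items() if not np.isin(c.unique(), [0, 1]).all()]
    ]

    scaler = StandardScaler().fit(num_cols)

    scaled = pd.DataFrame(
        scaler.transform(X[num_cols.columns]),
        index=X.index,
        columns=num_cols.columns,
    )
    X.update(scaled)  # merge the scaled columns back in place

In the patch, the resulting frame is additionally routed through self._convert(...) so the caller receives it in the data engine configured on the estimator.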
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Scaling features...", 1) - Xt = self._estimator.transform(X[self._estimator.feature_names_in_]) - X.update(Xt) + Xt.update(self._estimator.transform(Xt[self._estimator.feature_names_in_])) - return X + return self._convert(Xt) - def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Apply the inverse transformation to the data. Parameters @@ -3175,7 +3134,7 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -3184,10 +3143,18 @@ def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Scaled dataframe. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Inversely scaling features...", 1) - Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_]) - Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_) - X.update(Xt) + Xt.update( + to_df( + data=self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]), + index=Xt.index, + columns=self._estimator.feature_names_in_, + ) + ) - return X + return self._convert(Xt) diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 060afb4f4..c9a1fce81 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -16,6 +16,7 @@ import pandas as pd from beartype import beartype from gplearn.genetic import SymbolicTransformer +from polars.dependencies import _lazy_import from scipy import stats from sklearn.base import is_classifier from sklearn.feature_selection import ( @@ -25,25 +26,24 @@ from sklearn.model_selection import cross_val_score from sklearn.utils.validation import _check_feature_names_in from typing_extensions import Self -from zoofs import ( - DragonFlyOptimization, GeneticOptimization, GreyWolfOptimization, - HarrisHawkOptimization, ParticleSwarmOptimization, -) from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, - IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, - Pandas, Verbose, + IntLargerEqualZero, IntLargerZero, NJobs, Operators, Pandas, Scalar, + Sequence, Verbose, XConstructor, YConstructor, ) from atom.utils.utils import ( - Goal, Task, check_is_fitted, check_scaling, composed, crash, - get_custom_scorer, is_sparse, lst, merge, method_to_log, sign, + Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse, + lst, merge, sign, to_df, to_tabular, ) +zoofs, _ = _lazy_import("zoofs") + + @beartype class FeatureExtractor(TransformerMixin): """Extract features from datetime columns. @@ -172,7 +172,7 @@ def __init__( self.drop_columns = drop_columns self.from_index = from_index - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Extract the new features. 
Parameters @@ -180,7 +180,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -189,19 +189,21 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. """ + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + self._log("Extracting datetime features...", 1) if self.from_index: - if hasattr(X.index, "to_timestamp"): - Xc = pd.DataFrame(X.index.to_timestamp()) - order = Xc.columns.tolist() + X.columns.tolist() + if hasattr(Xt.index, "to_timestamp"): + Xc = pd.DataFrame(Xt.index.to_timestamp()) + order = Xc.columns.tolist() + Xt.columns.tolist() else: raise ValueError("Unable to convert the index to a timestamp format.") else: - Xc = X.select_dtypes(exclude="number") - order = X.columns.tolist() + Xc = Xt.select_dtypes(exclude="number") + order = Xt.columns.tolist() - Xt = pd.DataFrame(index=X.index) + X_new = pd.DataFrame(index=Xt.index) for name, column in Xc.items(): col_dt = pd.to_datetime( arg=column, @@ -259,13 +261,13 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: new_name = f"{name}_{fx}" if self.encoding_type == "ordinal" or max_val is None: self._log(f" --> Creating feature {new_name}.", 2) - Xt[new_name] = series.to_numpy() + X_new[new_name] = series.to_numpy() order.insert(order.index(name) + 1, new_name) elif self.encoding_type == "cyclic": self._log(f" --> Creating cyclic feature {new_name}.", 2) pos = 2 * np.pi * (series.to_numpy() - min_val) / np.array(max_val) - Xt[f"{new_name}_sin"] = np.sin(pos) - Xt[f"{new_name}_cos"] = np.cos(pos) + X_new[f"{new_name}_sin"] = np.sin(pos) + X_new[f"{new_name}_cos"] = np.cos(pos) order.insert(order.index(name) + 1, f"{new_name}_sin") order.insert(order.index(name) + 2, f"{new_name}_cos") @@ -273,7 +275,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: if self.drop_columns or self.from_index: order.remove(name) - return merge(Xt, X)[order] + return self._convert(merge(X_new, Xt)[order]) @beartype @@ -418,7 +420,7 @@ def __init__( self.operators = operators self.kwargs = kwargs - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -426,25 +428,21 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. - Returns ------- self Estimator instance. 
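With encoding_type="cyclic", FeatureExtractor maps each datetime-derived value onto the unit circle via 2π(value − min) / period, so the two ends of the range (for example December and January) end up close together. A minimal numpy sketch for a hypothetical month feature:

    import numpy as np
    import pandas as pd

    month = pd.Series([1, 4, 7, 10, 12])  # hypothetical month-of-year values
    min_val, period = 1, 12

    pos = 2 * np.pi * (month.to_numpy() - min_val) / period
    month_sin, month_cos = np.sin(pos), np.cos(pos)  # the two new cyclic features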
""" + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + all_operators = { "add": "add_numeric", "sub": "subtract_numeric", @@ -467,7 +465,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: if self.strategy == "dfs": # Run deep feature synthesis with transformation primitives - es = ft.EntitySet(dataframes={"X": (X, "_index", None, None, None, True)}) + es = ft.EntitySet(dataframes={"X": (Xt, "_index", None, None, None, True)}) self._dfs = ft.dfs( target_dataframe_name="X", entityset=es, @@ -478,7 +476,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: ) # Select the new features (dfs also returns originals) - self._dfs = self._dfs[X.shape[1] - 1:] + self._dfs = self._dfs[Xt.shape[1] - 1:] # Get a random selection of features if self.n_features and self.n_features < len(self._dfs): @@ -502,11 +500,11 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: n_jobs=kwargs.pop("n_jobs", self.n_jobs), random_state=kwargs.pop("random_state", self.random_state), **kwargs, - ).fit(X, y) + ).fit(Xt, yt) return self - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Generate new features. Parameters @@ -514,7 +512,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -523,10 +521,14 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
""" + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Generating new features...", 1) if self.strategy == "dfs": - es = ft.EntitySet(dataframes={"X": (X, "index", None, None, None, True)}) + es = ft.EntitySet(dataframes={"X": (Xt, "index", None, None, None, True)}) dfs = ft.calculate_feature_matrix( features=self._dfs, entityset=es, @@ -534,7 +536,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: ) # Add the new features to the feature set - X = pd.concat([X, dfs], axis=1).set_index("index") + Xt = pd.concat([Xt, dfs], axis=1).set_index("index") self._log(f" --> {len(self._dfs)} new features were added.", 2) @@ -544,7 +546,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: data=[ ["", str(fx), fx.fitness_] for i, fx in enumerate(self.gfg_) - if str(fx) not in X.columns + if str(fx) not in Xt.columns ], columns=["name", "description", "fitness"], ) @@ -552,7 +554,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: # Check if any new features remain if len(df) == 0: self._log(" --> The genetic algorithm didn't find any improving features.", 2) - return X + return Xt # Select the n_features with the highest fitness df = df.drop_duplicates() @@ -562,17 +564,16 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: if len(df) != self.n_features: self._log( f" --> Dropping {(self.n_features or len(self.gfg_)) - len(df)} " - "features due to repetition.", - 2, + "features due to repetition.", 2, ) - for i, array in enumerate(self.gfg_.transform(X)[:, df.index].T): + for i, array in enumerate(self.gfg_.transform(Xt)[:, df.index].T): # If the column is new, use a default name counter = 0 while True: - name = f"x{X.shape[1] + counter}" + name = f"x{Xt.shape[1] + counter}" if name not in X: - X[name] = array # Add new feature to X + Xt[name] = array # Add new feature to X df.iloc[i, 0] = name break else: @@ -581,7 +582,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log(f" --> {len(df)} new features were added.", 2) self.genetic_features_ = df.reset_index(drop=True) - return X + return self._convert(Xt) @beartype @@ -677,7 +678,7 @@ def __init__( self.operators = operators self.drop_columns = drop_columns - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Group features. Parameters @@ -685,7 +686,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -694,6 +695,8 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. 
""" + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + self._log("Grouping features...", 1) if self.operators is None: @@ -705,10 +708,10 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: for name, group in self.groups.items(): for operator in operators: try: - result = X[group].apply(getattr(np, operator), axis=1) + result = Xt[group].apply(getattr(np, operator), axis=1) except AttributeError: try: - result = getattr(stats, operator)(X[group], axis=1)[0] + result = getattr(stats, operator)(Xt[group], axis=1)[0] except AttributeError: raise ValueError( "Invalid value for the operators parameter. Value " @@ -716,7 +719,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: ) from None try: - X[f"{operator}({name})"] = result + Xt[f"{operator}({name})"] = result except ValueError: raise ValueError( "Invalid value for the operators parameter. Value " @@ -727,9 +730,9 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: self._log(f" --> Group {name} successfully created.", 2) if self.drop_columns: - X = X.drop(columns=to_drop) + Xt = Xt.drop(columns=to_drop) - return X + return self._convert(Xt) @beartype @@ -896,25 +899,12 @@ class FeatureSelector(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" - - - "estimator": - - - "sklearn" (default) - - "sklearnex" - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -1019,7 +1009,7 @@ def __init__( self.max_correlation = max_correlation self.kwargs = kwargs - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit the feature selector to the data. The univariate, sfm (when model is not fitted), sfs, rfe and @@ -1031,19 +1021,9 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: X: dataframe-like Feature set with shape=(n_samples, n_features). - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - - If None: `y` is ignored. - - If int: Position of the target column in `X`. - - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - - If sequence: Target column with shape=(n_samples,) or - sequence of column names or positions for multioutput - tasks. - - If dataframe-like: Target columns with shape=(n_samples, - n_targets) for multioutput tasks. 
- Returns ------- self @@ -1069,6 +1049,12 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): model.fit(X_train, y_train) return scoring(model, X_valid, y_valid) + Xt = to_df(X) + tt = to_tabular(y, index=Xt.index) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) + self.collinear_ = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"]) self.scaler_ = None @@ -1085,11 +1071,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): "sfs": "SequentialFeatureSelector", "rfe": "RFE", "rfecv": "RFECV", - "pso": ParticleSwarmOptimization, - "hho": HarrisHawkOptimization, - "gwo": GreyWolfOptimization, - "dfo": DragonFlyOptimization, - "go": GeneticOptimization, + "pso": zoofs.ParticleSwarmOptimization, + "hho": zoofs.HarrisHawkOptimization, + "gwo": zoofs.GreyWolfOptimization, + "dfo": zoofs.DragonFlyOptimization, + "go": zoofs.GeneticOptimization, } if isinstance(self.strategy, str): @@ -1153,9 +1139,9 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) if self.n_features is None: - self._n_features = X.shape[1] + self._n_features = Xt.shape[1] elif self.n_features < 1: - self._n_features = int(self.n_features * X.shape[1]) + self._n_features = int(self.n_features * Xt.shape[1]) else: self._n_features = self.n_features @@ -1169,9 +1155,9 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): max_repeated: Scalar if self.max_repeated is None: - max_repeated = len(X) + max_repeated = len(Xt) elif self.max_repeated <= 1: - max_repeated = self.max_repeated * len(X) + max_repeated = self.max_repeated * len(Xt) else: max_repeated = int(self.max_repeated) @@ -1185,30 +1171,30 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # Remove features with too high variance if self.min_repeated is not None: - for name, column in X.select_dtypes(exclude="number").items(): + for name, column in Xt.select_dtypes(exclude="number").items(): max_counts = column.value_counts() if min_repeated > max_counts.max(): self._high_variance[name] = (max_counts.idxmax(), max_counts.max()) - X = X.drop(columns=name) + Xt = Xt.drop(columns=name) break # Remove features with too low variance if self.max_repeated is not None: - for name, column in X.select_dtypes(exclude="number").items(): + for name, column in Xt.select_dtypes(exclude="number").items(): for category, count in column.value_counts().items(): if count >= max_repeated: - self._low_variance[name] = (category, 100.0 * count / len(X)) - X = X.drop(columns=name) + self._low_variance[name] = (category, 100.0 * count / len(Xt)) + Xt = Xt.drop(columns=name) break # Remove features with too high correlation self.collinear = pd.DataFrame(columns=["drop", "corr_feature", "corr_value"]) if self.max_correlation: # Get the Pearson correlation coefficient matrix - if y is None: - corr_X = X.corr() + if yt is None: + corr_X = Xt.corr() else: - corr_matrix = merge(X, y).corr() + corr_matrix = merge(Xt, yt).corr() corr_X, corr_y = corr_matrix.iloc[:-1, :-1], corr_matrix.iloc[:-1, -1] corr = {} @@ -1219,7 +1205,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # Always finds himself with correlation 1 if len(corr[col]) > 1: - if y is None: + if yt is None: # Drop all but the first one to_drop.extend(list(corr[col][1:].index)) else: @@ -1244,7 +1230,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ignore_index=True, ) - X = 
X.drop(columns=self.collinear_["drop"].tolist()) + Xt = Xt.drop(columns=self.collinear_["drop"].tolist()) if self.strategy is None: return self # Exit feature_engineering @@ -1275,14 +1261,14 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): solver = self.solver check_y() - self._estimator = SelectKBest(solver, k=self._n_features).fit(X, y) + self._estimator = SelectKBest(solver, k=self._n_features).fit(Xt, yt) elif self.strategy == "pca": - if not is_sparse(X): + if not is_sparse(Xt): # PCA requires the features to be scaled - if not check_scaling(X): + if not check_scaling(Xt): self.scaler_ = Scaler() - X = self.scaler_.fit_transform(X) + Xt = self.scaler_.fit_transform(Xt) estimator = self._get_est_class("PCA", "decomposition") solver_param = "svd_solver" @@ -1298,11 +1284,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # The PCA and TruncatedSVD both get all possible components to use # for the plots (n_components must be < n_features and <= n_rows) self._estimator = estimator( - n_components=min(len(X), X.shape[1] - 1), + n_components=min(len(Xt), Xt.shape[1] - 1), **{solver_param: solver}, random_state=self.random_state, **self.kwargs, - ).fit(X) + ).fit(Xt) self._estimator._comps = min(self._estimator.components_.shape[0], self._n_features) @@ -1324,7 +1310,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) if prefit: - if list(getattr(solver, "feature_names_in_", [])) != list(X.columns): + if list(getattr(solver, "feature_names_in_", [])) != list(Xt.columns): raise ValueError( "Invalid value for the solver parameter. The " f"{solver.__class__.__name__} estimator " @@ -1333,7 +1319,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._estimator.estimator_ = solver else: check_y() - self._estimator.fit(X, y) + self._estimator.fit(Xt, yt) elif self.strategy in ("sfs", "rfe", "rfecv"): if self.strategy == "sfs": @@ -1375,7 +1361,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): **kwargs, ) - self._estimator.fit(X, y) + self._estimator.fit(Xt, yt) else: check_y() @@ -1392,7 +1378,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): "cannot be absent when X_valid is provided." ) else: - X_valid, y_valid = X, y + X_valid, y_valid = Xt, yt # Get scoring for default objective_function if "objective_function" not in kwargs: @@ -1400,7 +1386,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): kwargs["scoring"] = get_custom_scorer(kwargs["scoring"]) else: goal = Goal(0) if is_classifier(solver) else Goal(1) - task = goal.infer_task(y) + task = goal.infer_task(yt) if task is Task.binary_classification: kwargs["scoring"] = get_custom_scorer("f1") elif task.is_multiclass: @@ -1416,8 +1402,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._estimator.fit( model=solver, - X_train=X, - y_train=y, + X_train=Xt, + y_train=yt, X_valid=X_valid, y_valid=y_valid, verbose=self.verbose >= 2, @@ -1472,7 +1458,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> ] ) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Transform the data. Parameters @@ -1480,7 +1466,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: X: dataframe-like Feature set with shape=(n_samples, n_features). 
- y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1489,6 +1475,10 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed feature set. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Performing feature selection ...", 1) # Remove features with too high variance @@ -1498,7 +1488,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f"Value {h_variance[0]} was the most repeated value with " f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.", 2, ) - X = X.drop(columns=fx) + Xt = Xt.drop(columns=fx) # Remove features with too low variance for fx, l_variance in self._low_variance.items(): @@ -1506,7 +1496,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f" --> Feature {fx} was removed due to low variance. Value " f"{l_variance[0]} repeated in {l_variance[1]:.1f}% of the rows.", 2, ) - X = X.drop(columns=fx) + Xt = Xt.drop(columns=fx) # Remove features with too high correlation for col in self.collinear_["drop"]: @@ -1514,11 +1504,11 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f" --> Feature {col} was removed due to " "collinearity with another feature.", 2, ) - X = X.drop(columns=col) + Xt = Xt.drop(columns=col) # Perform selection based on strategy if self.strategy is None: - return X + return self._convert(Xt) elif self.strategy == "univariate": self._log( @@ -1532,16 +1522,16 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: f"(score: {self.univariate_.scores_[n]:.2f} " f"p-value: {self.univariate_.pvalues_[n]:.2f}).", 2, ) - X = X.drop(columns=column) + Xt = Xt.drop(columns=column) elif self.strategy == "pca": self._log(" --> Applying Principal Component Analysis...", 2) if self.scaler_: self._log(" --> Scaling features...", 2) - X = self.scaler_.transform(X) + Xt = self.scaler_.transform(Xt) - X = self._estimator.transform(X).iloc[:, :self._estimator._comps] + Xt = self._estimator.transform(Xt).iloc[:, :self._estimator._comps] var = np.array(self._estimator.explained_variance_ratio_[:self._n_features]) self._log(f" --> Keeping {self._estimator._comps} components.", 2) @@ -1560,7 +1550,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: ) else: self._log(f" --> Dropping feature {column}.", 2) - X = X.drop(columns=column) + Xt = Xt.drop(columns=column) else: # Advanced strategies self._log( @@ -1571,6 +1561,6 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: for column in X: if column not in self._estimator.best_feature_list: self._log(f" --> Dropping feature {column}.", 2) - X = X.drop(columns=column) + Xt = Xt.drop(columns=column) - return X + return self._convert(Xt) diff --git a/atom/models/classreg.py b/atom/models/classreg.py index df4129ef1..41250d822 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -8,6 +8,7 @@ from __future__ import annotations from typing import Any, ClassVar, cast + import numpy as np import pandas as pd from optuna.distributions import BaseDistribution diff --git a/atom/nlp.py b/atom/nlp.py index 103c548d9..6c855d343 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -11,34 +11,29 @@ import unicodedata from string import punctuation -import nltk import numpy as np import pandas as pd from beartype import beartype -from nltk.collocations import ( - BigramCollocationFinder, 
QuadgramCollocationFinder, - TrigramCollocationFinder, -) -from nltk.corpus import wordnet -from nltk.stem import SnowballStemmer, WordNetLemmatizer +from polars.dependencies import _lazy_import from sklearn.base import OneToOneFeatureMixin -from sklearn.utils._set_output import _SetOutputMixin from sklearn.utils.validation import _check_feature_names_in from typing_extensions import Self from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, Engine, FloatLargerZero, Sequence, Pandas, - VectorizerStarts, Verbose, bool_t, + Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, + XConstructor, YConstructor, bool_t, ) from atom.utils.utils import ( - check_is_fitted, check_nltk_module, composed, crash, get_corpus, is_sparse, - merge, method_to_log, to_df, + check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df, ) +nltk, _ = _lazy_import("nltk") + + @beartype -class TextCleaner(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class TextCleaner(TransformerMixin, OneToOneFeatureMixin): r"""Applies standard text cleaning to the corpus. Transformations include normalizing characters and dropping @@ -193,7 +188,7 @@ def __init__( self.regex_number = regex_number self.drop_punctuation = drop_punctuation - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Apply the transformations to the data. Parameters @@ -203,7 +198,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -244,28 +239,29 @@ def drop_regex(regex: str): Regex pattern to replace. 
""" - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].str.replace(regex, "", regex=True) + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].str.replace(regex, "", regex=True) else: - X[corpus] = X[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x]) + Xt[corpus] = Xt[corpus].apply(lambda x: [re.sub(regex, "", w) for w in x]) - corpus = get_corpus(X) + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + corpus = get_corpus(Xt) self._log("Cleaning the corpus...", 1) if self.decode: - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].apply(lambda x: to_ascii(x)) + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].apply(lambda x: to_ascii(x)) else: - X[corpus] = X[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc]) + Xt[corpus] = Xt[corpus].apply(lambda doc: [to_ascii(str(w)) for w in doc]) self._log(" --> Decoding unicode characters to ascii.", 2) if self.lower_case: self._log(" --> Converting text to lower case.", 2) - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].str.lower() + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].str.lower() else: - X[corpus] = X[corpus].apply(lambda doc: [str(w).lower() for w in doc]) + Xt[corpus] = Xt[corpus].apply(lambda doc: [str(w).lower() for w in doc]) if self.drop_email: if not self.regex_email: @@ -305,21 +301,21 @@ def drop_regex(regex: str): if self.drop_punctuation: self._log(" --> Dropping punctuation from the text.", 2) trans_table = str.maketrans("", "", punctuation) # Translation table - if isinstance(X[corpus].iloc[0], str): + if isinstance(Xt[corpus].iloc[0], str): func = lambda doc: doc.translate(trans_table) else: func = lambda doc: [str(w).translate(trans_table) for w in doc] - X[corpus] = X[corpus].apply(func) + Xt[corpus] = Xt[corpus].apply(func) # Drop empty tokens from every document - if not isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].apply(lambda doc: [w for w in doc if w]) + if not isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].apply(lambda doc: [w for w in doc if w]) - return X + return self._convert(Xt) @beartype -class TextNormalizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class TextNormalizer(TransformerMixin, OneToOneFeatureMixin): """Normalize the corpus. Convert words to a more uniform standard. The transformations @@ -443,7 +439,7 @@ def __init__( self.stem = stem self.lemmatize = lemmatize - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Normalize the text. Parameters @@ -453,7 +449,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -463,7 +459,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: """ - def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: + def pos(tag: str) -> nltk.corpus.wordnet: """Get part of speech from a tag. 
Parameters @@ -478,21 +474,22 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: """ if tag in ("JJ", "JJR", "JJS"): - return wordnet.ADJ + return nltk.corpus.wordnet.ADJ elif tag in ("RB", "RBR", "RBS"): - return wordnet.ADV + return nltk.corpus.wordnet.ADV elif tag in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): - return wordnet.VERB + return nltk.corpus.wordnet.VERB else: # "NN", "NNS", "NNP", "NNPS" - return wordnet.NOUN + return nltk.corpus.wordnet.NOUN - corpus = get_corpus(X) + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + corpus = get_corpus(Xt) self._log("Normalizing the corpus...", 1) # If the corpus is not tokenized, separate by space - if isinstance(X[corpus].iloc[0], str): - X[corpus] = X[corpus].apply(lambda row: row.split()) + if isinstance(Xt[corpus].iloc[0], str): + Xt[corpus] = Xt[corpus].apply(lambda row: row.split()) stopwords = set() if self.stopwords: @@ -510,15 +507,15 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: if stopwords: self._log(" --> Dropping stopwords.", 2) f = lambda row: [word for word in row if word not in stopwords] - X[corpus] = X[corpus].apply(f) + Xt[corpus] = Xt[corpus].apply(f) if self.stem: if isinstance(self.stem, bool_t): self.stem = "english" self._log(" --> Applying stemming.", 2) - ss = SnowballStemmer(language=self.stem.lower()) - X[corpus] = X[corpus].apply(lambda row: [ss.stem(word) for word in row]) + ss = nltk.stem.SnowballStemmer(language=self.stem.lower()) + Xt[corpus] = Xt[corpus].apply(lambda row: [ss.stem(word) for word in row]) if self.lemmatize: self._log(" --> Applying lemmatization.", 2) @@ -526,15 +523,15 @@ def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: check_nltk_module("taggers/averaged_perceptron_tagger", quiet=self.verbose < 2) check_nltk_module("corpora/omw-1.4", quiet=self.verbose < 2) - wnl = WordNetLemmatizer() + wnl = nltk.stem.WordNetLemmatizer() f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in nltk.pos_tag(row)] - X[corpus] = X[corpus].apply(f) + Xt[corpus] = Xt[corpus].apply(f) - return X + return self._convert(Xt) @beartype -class Tokenizer(TransformerMixin, OneToOneFeatureMixin, _SetOutputMixin): +class Tokenizer(TransformerMixin, OneToOneFeatureMixin): """Tokenize the corpus. Convert documents into sequences of words. Additionally, @@ -662,7 +659,7 @@ def __init__( self.trigram_freq = trigram_freq self.quadgram_freq = quadgram_freq - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Tokenize the text. Parameters @@ -672,7 +669,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
Returns @@ -709,24 +706,25 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: return row_c[2:-2].split(sep) - corpus = get_corpus(X) + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + corpus = get_corpus(Xt) self._log("Tokenizing the corpus...", 1) - if isinstance(X[corpus].iloc[0], str): + if isinstance(Xt[corpus].iloc[0], str): check_nltk_module("tokenizers/punkt", quiet=self.verbose < 2) - X[corpus] = X[corpus].apply(lambda row: nltk.word_tokenize(row)) + Xt[corpus] = Xt[corpus].apply(lambda row: nltk.word_tokenize(row)) ngrams = { - "bigrams": BigramCollocationFinder, - "trigrams": TrigramCollocationFinder, - "quadgrams": QuadgramCollocationFinder, + "bigrams": nltk.collocations.BigramCollocationFinder, + "trigrams": nltk.collocations.TrigramCollocationFinder, + "quadgrams": nltk.collocations.QuadgramCollocationFinder, } for attr, finder in ngrams.items(): if frequency := getattr(self, f"{attr[:-1]}_freq"): # Search for all n-grams in the corpus - ngram_fd = finder.from_documents(X[corpus]).ngram_fd + ngram_fd = finder.from_documents(Xt[corpus]).ngram_fd if frequency < 1: frequency = int(frequency * len(ngram_fd)) @@ -737,7 +735,7 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: if freq >= frequency: occur += 1 counts += freq - X[corpus] = X[corpus].apply(replace_ngrams, args=(ngram,)) + Xt[corpus] = Xt[corpus].apply(replace_ngrams, args=(ngram,)) rows.append({attr[:-1]: "_".join(ngram), "frequency": freq}) if rows: @@ -749,11 +747,11 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: else: self._log(f" --> No {attr} found in the corpus.", 2) - return X + return self._convert(Xt) @beartype -class Vectorizer(TransformerMixin, _SetOutputMixin): +class Vectorizer(TransformerMixin): """Vectorize text data. Transform the corpus into meaningful vectors of numbers. The @@ -789,24 +787,12 @@ class Vectorizer(TransformerMixin, _SetOutputMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: str, dict or None, default=None - Execution engine to use for [data][data-acceleration] and - [estimators][estimator-acceleration]. The value should be - one of the possible values to change one of the two engines, - or a dictionary with keys `data` and `estimator`, with their - corresponding choice as values to change both engines. If - None, the default values are used. Choose from: - - - "data": - - - "pandas" (default) - - "pyarrow" - - "modin" + engine: str or None, default=None + Execution engine to use for [estimators][estimator-acceleration]. + If None, the default value is used. Choose from: - - "estimator": - - - "sklearn" (default) - - "cuml" + - "sklearn" (default) + - "cuml" verbose: int, default=0 Verbosity level of the class. Choose from: @@ -920,7 +906,7 @@ def _get_corpus_columns(self) -> list[str]: "The get_feature_names_out method is not available for strategy='hashing'." ) - def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: + def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """Fit to data. Parameters @@ -930,7 +916,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
Returns @@ -939,11 +925,15 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: Estimator instance. """ - self._corpus = get_corpus(X) + Xt = to_df(X) + self._corpus = get_corpus(Xt) + + self._check_feature_names(Xt, reset=True) + self._check_n_features(Xt, reset=True) # Convert a sequence of tokens to space separated string - if not isinstance(X[self._corpus].iloc[0], str): - X[self._corpus] = X[self._corpus].apply(lambda row: " ".join(row)) + if not isinstance(Xt[self._corpus].iloc[0], str): + Xt[self._corpus] = Xt[self._corpus].apply(lambda row: " ".join(row)) strategies = { "bow": "CountVectorizer", @@ -962,7 +952,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: self._estimator.set_output(transform="default") self._log("Fitting Vectorizer...", 1) - self._estimator.fit(X[self._corpus]) + self._estimator.fit(Xt[self._corpus]) # Add the estimator as attribute to the instance setattr(self, f"{self.strategy}_", self._estimator) @@ -990,7 +980,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> og_columns = [c for c in self.feature_names_in_ if c != self._corpus] return np.array(og_columns + self._get_corpus_columns()) - def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: """Vectorize the text. Parameters @@ -1000,7 +990,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: not a dataframe, it should be composed of a single feature containing the text documents. - y: int, str, sequence, dataframe-like or None, default=None + y: sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. Returns @@ -1009,14 +999,18 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: Transformed corpus. """ + check_is_fitted(self) + + Xt = to_df(X, columns=self.feature_names_in_) + self._log("Vectorizing the corpus...", 1) # Convert a sequence of tokens to space-separated string - if not isinstance(X[self._corpus].iloc[0], str): - X[self._corpus] = X[self._corpus].apply(lambda row: " ".join(row)) + if not isinstance(Xt[self._corpus].iloc[0], str): + Xt[self._corpus] = Xt[self._corpus].apply(lambda row: " ".join(row)) - matrix = self._estimator.transform(X[self._corpus]) - X = X.drop(columns=self._corpus) # Drop original corpus column + matrix = self._estimator.transform(Xt[self._corpus]) + Xt = Xt.drop(columns=self._corpus) # Drop original corpus column if "sklearn" not in self._estimator.__class__.__module__: matrix = matrix.get() # Convert cupy sparse array back to scipy @@ -1024,7 +1018,7 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: if not self.return_sparse: self._log(" --> Converting the output to a full array.", 2) matrix = matrix.toarray() - elif not X.empty and not is_sparse(X): + elif not Xt.empty and not is_sparse(X): # Raise if there are other columns that are non-sparse raise ValueError( "Invalid value for the return_sparse parameter. 
The value must " @@ -1037,4 +1031,4 @@ def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame: # Hashing has no words to put as column names columns = [f"hash{i}" for i in range(1, matrix.shape[1] + 1)] - return merge(X, to_df(matrix, index=X.index, columns=columns)) + return self._convert(merge(Xt, to_df(matrix, index=Xt.index, columns=columns))) diff --git a/atom/pipeline.py b/atom/pipeline.py index 7d0ac0b69..156bcd45d 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -12,6 +12,7 @@ from typing import Any, Literal import numpy as np +import pandas as pd from joblib import Memory from sklearn.base import clone from sklearn.pipeline import Pipeline as SkPipeline @@ -26,8 +27,8 @@ from typing_extensions import Self from atom.utils.types import ( - Bool, Estimator, FHConstructor, Float, Scalar, Sequence, - Pandas, Verbose, XConstructor, YConstructor, EngineDataOptions + Bool, EngineDataOptions, EngineTuple, Estimator, FHConstructor, Float, + Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, ) from atom.utils.utils import ( NotFittedError, adjust_verbosity, check_is_fitted, fit_one, @@ -926,7 +927,7 @@ def predict_var( X: XConstructor | None = None, *, cov: Bool = False, - ) -> DataFrame: + ) -> pd.DataFrame: """Transform, then predict_var of the final estimator. Parameters diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index 2e87e07c4..f8e2ceae5 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -30,8 +30,8 @@ from atom.plots.baseplot import BasePlot from atom.utils.constants import PALETTE from atom.utils.types import ( - Bool, ColumnSelector, Int, IntLargerZero, Legend, PACFMethods, - RowSelector, Segment, Sequence, TargetSelector, + Bool, ColumnSelector, Int, IntLargerZero, Legend, PACFMethods, RowSelector, + Segment, Sequence, TargetSelector, ) from atom.utils.utils import ( check_dependency, crash, divide, get_corpus, has_task, lst, @@ -1223,7 +1223,7 @@ def plot_ngrams( """ - def get_text(column: Series) -> Series: + def get_text(column: pd.Series) -> pd.Series: """Get the complete corpus as sequence of tokens. Parameters diff --git a/atom/utils/patches.py b/atom/utils/patches.py index ac770d9dc..bc7b2bc2b 100644 --- a/atom/utils/patches.py +++ b/atom/utils/patches.py @@ -9,7 +9,6 @@ from collections.abc import Callable from copy import deepcopy -from functools import wraps from typing import Any from unittest.mock import patch @@ -22,7 +21,6 @@ from sklearn.ensemble._base import _fit_single_estimator from sklearn.model_selection._validation import _fit_and_score, _score from sklearn.utils import Bunch -from sklearn.utils._set_output import _wrap_method_output from sklearn.utils.multiclass import check_classification_targets from sktime.forecasting.compose import EnsembleForecaster as EF from sktime.forecasting.compose import StackingForecaster as SF diff --git a/atom/utils/types.py b/atom/utils/types.py index 5a44217b0..bfb01c9a3 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -13,11 +13,8 @@ TypeAlias, TypedDict, TypeVar, overload, runtime_checkable, ) -import modin.pandas as md import numpy as np import pandas as pd -import polars as pl -import pyarrow as pa import scipy.sparse as sps from beartype.door import is_bearable from beartype.typing import Protocol @@ -187,8 +184,8 @@ class Model(Protocol): """Protocol for all models.""" _goal: Goal - # _metric: ClassMap - # _ht: dict[str, Any] + _metric: ClassMap + _ht: dict[str, Any] def predict(self, *args, **kwargs) -> Pandas: ... 
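The `Model` protocol above relies on structural (duck) typing: any object that exposes the listed attributes and a `predict` method is accepted as a model, without inheriting from a shared base class. As a minimal, self-contained sketch of that pattern (the `SupportsPredict` and `DummyModel` names are invented for illustration and are not part of ATOM):

```python
from typing import Protocol, runtime_checkable

import pandas as pd


@runtime_checkable
class SupportsPredict(Protocol):
    """Structural type: any object with a predict method qualifies."""

    def predict(self, X: pd.DataFrame) -> pd.Series: ...


class DummyModel:
    """Note: no inheritance from SupportsPredict is required."""

    def predict(self, X: pd.DataFrame) -> pd.Series:
        # Trivial prediction: a column of zeros aligned with X's index
        return pd.Series(0, index=X.index)


# isinstance works because the protocol is runtime_checkable;
# only the presence of the member is checked, not its signature.
assert isinstance(DummyModel(), SupportsPredict)
```

Because the check is structural, custom estimators and wrappers satisfy the protocol as long as they expose the expected interface.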
@@ -224,7 +221,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... | pd.DataFrame ) XSelector: TypeAlias = XConstructor | Callable[..., XConstructor] -YConstructor: TypeAlias = dict[str, Any] | Sequence[Any] | XConstructor +YConstructor: TypeAlias = Sequence[Any] | XConstructor YSelector: TypeAlias = Int | str | YConstructor FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon diff --git a/atom/utils/utils.py b/atom/utils/utils.py index f9c6be978..ab42d7900 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -24,15 +24,11 @@ from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload import mlflow -import modin.pandas as md import nltk import numpy as np import pandas as pd import plotly.graph_objects as go -import polars as pl -import pyarrow as pa import scipy.sparse as sps -from beartype import beartype from beartype.door import is_bearable from IPython.display import display from matplotlib.colors import to_rgba @@ -52,12 +48,12 @@ from sklearn.utils import _print_elapsed_time from sklearn.utils.validation import _is_fitted -from atom.utils.constants import __version__ +from atom.utils.constants import CAT_TYPES, __version__ from atom.utils.types import ( Bool, Estimator, FeatureNamesOut, Float, IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Sequence, SPTuple, Pandas, Transformer, TReturn, - TReturns, Verbose, XConstructor, XSelector, YConstructor, YSelector, int_t, + Scorer, Segment, Sequence, SPTuple, Transformer, TReturn, TReturns, + Verbose, XConstructor, XSelector, YConstructor, YSelector, int_t, segment_t, sequence_t, ) @@ -253,12 +249,12 @@ class DataConfig: test_size: Scalar = 0.2 holdout_size: Scalar | None = None - def get_stratify_columns(self, df: DataFrame, y: Pandas) -> DataFrame | None: + def get_stratify_columns(self, df: pd.DataFrame, y: Pandas) -> pd.DataFrame | None: """Get columns to stratify by. Parameters ---------- - df: dataframe + df: pd.DataFrame Dataset from which to get the columns. y: series or dataframe @@ -940,7 +936,7 @@ def explainer(self) -> Explainer: def get_explanation( self, - df: DataFrame, + df: pd.DataFrame, target: tuple[Int, ...], ) -> Explanation: """Get an Explanation object. @@ -949,7 +945,7 @@ def get_explanation( Parameters ---------- - df: dataframe + df: pd.DataFrame Data set to look at (subset of the complete dataset). target: tuple @@ -1309,7 +1305,7 @@ def sign(obj: Callable) -> MappingProxyType: return signature(obj).parameters -def merge(*args) -> DataFrame: +def merge(*args) -> pd.DataFrame: """Concatenate pandas objects column-wise. None and empty objects are ignored. @@ -1401,7 +1397,7 @@ def n_cols(obj: XSelector | YSelector) -> int: if hasattr(obj, "shape"): return obj.shape[1] if len(obj.shape) > 1 else 1 elif isinstance(obj, dict): - return len(obj) + return 2 # Dict always goes to dataframe try: if (array := np.asarray(obj)).ndim > 1: @@ -1729,7 +1725,7 @@ def get_versions(models: ClassMap) -> dict[str, str]: return versions -def get_corpus(df: DataFrame) -> str: +def get_corpus(df: pd.DataFrame) -> str: """Get text column from a dataframe. The text column should be called `corpus` (case-insensitive). Also @@ -1737,7 +1733,7 @@ def get_corpus(df: DataFrame) -> str: Parameters ---------- - df: dataframe + df: pd.DataFrame Data set from which to get the corpus. 
Returns @@ -1844,15 +1840,27 @@ def to_df( else: data_c = data - if data_c is not None and columns is not None: - # Reorder columns to the provided order - try: - data_c = data_c[list(columns)] # Force order determined by columns - except KeyError: - raise ValueError( - f"The columns are different than seen at fit time. Features " - f"{set(data_c.columns) - set(columns)} are missing in X." - ) from None + if data_c is not None: + # If text dataset, change the name of the column to corpus + if list(data_c.columns) == ["x0"] and data_c.dtypes[0].name in CAT_TYPES: + data_c = data_c.rename(columns={data_c.columns[0]: "corpus"}) + else: + # Convert all column names to str + data_c.columns = data_c.columns.astype(str) + + # No duplicate rows nor column names are allowed + if data_c.columns.duplicated().any(): + raise ValueError("Duplicate column names found in X.") + + if columns is not None: + # Reorder columns to the provided order + try: + data_c = data_c[list(columns)] # Force order determined by columns + except KeyError: + raise ValueError( + f"The columns are different than seen at fit time. Features " + f"{set(data_c.columns) - set(columns)} are missing in X." + ) from None return data_c @@ -2115,7 +2123,7 @@ def get_custom_scorer(metric: str | MetricFunction | Scorer) -> Scorer: def name_cols( array: TReturn, - original_df: DataFrame, + original_df: pd.DataFrame, col_names: list[str], ) -> list[str]: """Get the column names after a transformation. @@ -2129,7 +2137,7 @@ def name_cols( array: np.ndarray, sps.matrix, series or dataframe Transformed dataset. - original_df: dataframe + original_df: pd.DataFrame Original dataset. col_names: list of str @@ -2219,10 +2227,10 @@ def get_col_order( def reorder_cols( transformer: Transformer, - df: DataFrame, - original_df: DataFrame, + df: pd.DataFrame, + original_df: pd.DataFrame, col_names: list[str], -) -> DataFrame: +) -> pd.DataFrame: """Reorder the columns to their original order. This function is necessary in case only a subset of the @@ -2234,10 +2242,10 @@ def reorder_cols( transformer: Transformer Instance that transformed `df`. - df: dataframe + df: pd.DataFrame Dataset to reorder. - original_df: dataframe + original_df: pd.DataFrame Original dataset (states the order). col_names: list of str @@ -2316,8 +2324,8 @@ def fit_one( Fitted estimator. 
""" - Xt = to_df(X, index=getattr(y, "index", None)) - yt = to_tabular(y, index=getattr(Xt, "index", None)) + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) with _print_elapsed_time("Pipeline", message): if hasattr(estimator, "fit"): @@ -2431,8 +2439,8 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: else: return out - Xt = to_df(X, index=getattr(y, "index", None)) - yt = to_tabular(y, index=getattr(Xt, "index", None)) + Xt = to_df(X) + yt = to_tabular(y, index=Xt.index) use_y = True @@ -2464,10 +2472,7 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: # Transform can return X, y or both if isinstance(out, tuple): X_new = prepare_df(out[0], Xt) - y_new = to_tabular( - data=out[1], - index=yt.index, - ) + y_new = to_tabular(out[1], index=X_new.index) if isinstance(yt, pd.DataFrame): y_new = prepare_df(y_new, yt) elif "X" in params and Xt is not None and any(c in Xt for c in inc): @@ -2475,10 +2480,7 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: X_new = prepare_df(out, Xt) y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0) elif y is not None: - y_new = to_tabular( - data=out, - index=yt.index, - ) + y_new = to_tabular(out) X_new = Xt if Xt is None else Xt.set_index(y_new.index) if isinstance(yt, pd.DataFrame): y_new = prepare_df(y_new, yt) @@ -2665,63 +2667,6 @@ def wrapper(*args, **kwargs) -> Any: return wrapper - -def wrap_transformer_methods(f: Callable) -> Callable: - """Wrap transformer methods with shared code. - - The following operations are always performed: - - - Transform the input to pandas types. - - Add the `feature_names_in_` and `n_features_in_` attributes. - - Check if the instance is fitted before transforming. - - """ - - @wraps(f) - @beartype - def wrapper( - self: T_Transformer, - X: XSelector | None = None, - y: YSelector | None = None, - **kwargs, - ) -> T_Transformer | Pandas | tuple[pd.DataFrame, Pandas]: - if f.__name__ == "fit": - Xt = to_df(X, index=getattr(y, "index", None)) - yt = to_tabular(y, index=getattr(Xt, "index", None)) - - self._check_feature_names(Xt, reset=True) - self._check_n_features(Xt, reset=True) - - return f(self, Xt, yt, **kwargs) - - else: - if "TransformerMixin" not in str(self.fit): - check_is_fitted(self) - - Xt = to_df( - data=X, - index=getattr(y, "index", None), - columns=getattr(self, "feature_names_in_", None), - ) - yt = to_tabular( - y, - index=getattr(Xt, "index", None), - columns=getattr(self, "target_names_in_", None), - ) - - if "y" in sign(f): - out = f(self, Xt, yt, **kwargs) - else: - out = f(self, Xt, **kwargs) - - if isinstance(out, tuple): - return tuple(self._convert(x) for x in out) - else: - return self._convert(out) - - return wrapper - - def make_sklearn( obj: T_Estimator, feature_names_out: FeatureNamesOut = "one-to-one", diff --git a/tests/conftest.py b/tests/conftest.py index 673194cc9..661cd51e5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,9 +11,9 @@ import numpy as np import pandas as pd -from ray.util.joblib import register_ray import pyarrow as pa import pytest +from ray.util.joblib import register_ray from sklearn.base import BaseEstimator from sklearn.datasets import ( load_breast_cancer, load_diabetes, load_wine, @@ -33,7 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import DataFrame, Sequence, Pandas, XSelector + from atom.utils.types import DataFrame, Pandas, Sequence, XSelector class DummyTransformer(TransformerMixin, BaseEstimator): @@ -109,14 +109,8 @@ def 
_mock_mlflow_log_model(mocker): mocker.patch("mlflow.sklearn.log_model") -@pytest.fixture() -def random(): - """Return numpy's default random number generator.""" - return np.random.default_rng() - - -@pytest.fixture() -def ray(): +@pytest.fixture(autouse=True) +def _register_ray(): """Register ray as joblib backend. Although atom does this internally, it's skipped when ray is @@ -127,6 +121,12 @@ def ray(): register_ray() +@pytest.fixture() +def random(): + """Return numpy's default random number generator.""" + return np.random.default_rng() + + def get_train_test( X: XSelector | None, y: Sequence[Any] | pd.DataFrame, diff --git a/tests/test_atom.py b/tests/test_atom.py index dc69b8be6..9c4689d6a 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -9,7 +9,6 @@ from unittest.mock import MagicMock, patch import numpy as np -import pyarrow as pa import pandas as pd import pytest from category_encoders.target_encoder import TargetEncoder @@ -33,9 +32,9 @@ from .conftest import ( X10, DummyTransformer, X10_dt, X10_nan, X10_str, X10_str2, X20_out, X_bin, - X_class, X_ex, X_label, X_reg, X_sparse, X_text, y10, y10_label, + X_class, X_ex, X_label, X_pa, X_reg, X_sparse, X_text, y10, y10_label, y10_label2, y10_sn, y10_str, y_bin, y_class, y_ex, y_fc, y_label, - y_multiclass, y_multireg, y_reg, X_pa + y_multiclass, y_multireg, y_reg, ) diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 80ca1a240..13b45e18b 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -602,8 +602,8 @@ def test_input_is_3_tuples(): """Assert that the 3 tuples input works.""" X_train = bin_train.iloc[:, :-1] y_train = bin_train.iloc[:, -1] - X_test = bin_test.iloc[100:-20, :-1] - y_test = bin_test.iloc[100:-20, -1] + X_test = bin_test.iloc[:-20, :-1] + y_test = bin_test.iloc[:-20, -1] X_holdout = bin_test.iloc[-20:, :-1] y_holdout = bin_test.iloc[-20:, -1] @@ -622,7 +622,7 @@ def test_input_is_train_test_holdout(): def test_4_data_provided(): - """Assert that the 4 elements input works.""" + """Assert that the four-element input works.""" X_train = bin_train.iloc[:, :-1] X_test = bin_test.iloc[:, :-1] y_train = bin_train.iloc[:, -1] @@ -634,11 +634,11 @@ def test_4_data_provided(): def test_6_data_provided(): - """Assert that the 6 elements input works.""" + """Assert that the six-element input works.""" X_train = bin_train.iloc[:, :-1] y_train = bin_train.iloc[:, -1] - X_test = bin_test.iloc[100:-20, :-1] - y_test = bin_test.iloc[100:-20, -1] + X_test = bin_test.iloc[:-20, :-1] + y_test = bin_test.iloc[:-20, -1] X_holdout = bin_test.iloc[-20:, :-1] y_holdout = bin_test.iloc[-20:, -1] diff --git a/tests/test_basetrainer.py b/tests/test_basetrainer.py index f78e3c8e7..5e3ffd6e4 100644 --- a/tests/test_basetrainer.py +++ b/tests/test_basetrainer.py @@ -14,6 +14,7 @@ from optuna.pruners import MedianPruner from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import f1_score, make_scorer + from atom import ATOMClassifier from atom.training import DirectClassifier, DirectRegressor @@ -376,7 +377,7 @@ def test_errors_keep(): @patch("atom.basetransformer.ray", MagicMock()) @patch("atom.basetrainer.ray", MagicMock()) -def test_parallel_with_ray(ray): +def test_parallel_with_ray(): """Assert that parallel runs successfully with ray backend.""" trainer = DirectClassifier( models=["LR", "LDA"], diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index 6f6095a20..987d23275 100644 --- a/tests/test_basetransformer.py +++ 
b/tests/test_basetransformer.py @@ -214,7 +214,7 @@ def test_column_order_is_retained(): def test_incorrect_columns(): """Assert that an error is raised when the provided columns do not match.""" - with pytest.raises(ValueError, match=".*features are different.*"): + with pytest.raises(ValueError, match=".*columns are different.*"): BaseTransformer._check_input(X_bin, columns=["1", "2"]) @@ -271,12 +271,6 @@ def test_sparse_matrices_2_tuples(): assert atom[atom.columns[0]].dtype.name == "Sparse[int64, 0]" -def test_target_is_dict(): - """Assert that the target column is assigned correctly for a dict.""" - _, y = BaseTransformer._check_input(X10, {"a": [0] * 10}) - assert isinstance(y, pd.Series) - - def test_multioutput_str(): """Assert that multioutput can be assigned by column name.""" X, y = BaseTransformer._check_input(X_bin, ["mean radius", "worst perimeter"]) diff --git a/tests/test_branch.py b/tests/test_branch.py index b4d44cc53..d168ec396 100644 --- a/tests/test_branch.py +++ b/tests/test_branch.py @@ -7,20 +7,23 @@ import glob import os from pathlib import Path -import polars as pl +from unittest.mock import MagicMock, patch + +import dask.dataframe as dd +import modin.pandas as md import numpy as np import pandas as pd +import polars as pl +import pyarrow as pa import pytest from pandas.testing import assert_frame_equal from sklearn.preprocessing import MinMaxScaler, StandardScaler -import pyarrow as pa -from unittest.mock import patch, MagicMock + from atom import ATOMClassifier, ATOMRegressor from atom.branch import Branch, BranchManager from atom.training import DirectClassifier from atom.utils.utils import merge -import modin.pandas as md -import dask.dataframe as dd + from .conftest import ( X10, X10_str, X_bin, X_bin_array, X_class, X_idx, y10, y10_str, y_bin, y_bin_array, y_idx, y_multiclass, @@ -755,8 +758,6 @@ def test_dask_engine(): @patch.dict("sys.modules", {"pyspark": MagicMock(spec=["__spec__", "sql"])}) def test_pyspark_engine(): """Assert that the pyspark engine returns pyspark types.""" - import sys - print(sys.modules) atom = ATOMClassifier(X_bin, y_bin, engine="pyspark", random_state=1) assert "createDataFrame" in str(atom.X) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index b7733ae3a..e4826620a 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -5,7 +5,6 @@ """ -from unittest.mock import MagicMock, patch import pandas as pd import pytest @@ -191,21 +190,6 @@ def test_hashing(): assert "hash1" in X -@patch.dict( - "sys.modules", - { - "cuml": MagicMock(spec=["__spec__"]), - "cuml.common.device_selection": MagicMock(spec=["set_global_device_type"]), - "cuml.internals.memory_utils": MagicMock(spec=["set_global_output_type"]), - "cuml.feature_extraction.text": MagicMock(), - }, -) -def test_gpu(): - """Assert that the gpu implementation calls the get method of matrix.""" - vectorizer = Vectorizer(device="gpu", engine="cuml") - pytest.raises(ValueError, vectorizer.fit_transform, X_text) - - def test_return_sparse(): """Assert that the output is sparse.""" X = Vectorizer(strategy="bow", return_sparse=True).fit_transform(X_text, y10) From 2e5941545e9ee56ed00c7172ef902aeedd2b159d Mon Sep 17 00:00:00 2001 From: Mavs Date: Sun, 18 Feb 2024 19:19:00 +0100 Subject: [PATCH 05/12] dataengines 5 --- atom/{branch => data}/__init__.py | 0 atom/{branch => data}/branch.py | 0 atom/{branch => data}/branchmanager.py | 0 atom/{branch => data}/dataengines.py | 0 tests/{test_branch.py => test_data.py} | 2 +- 5 files changed, 1 insertion(+), 1 deletion(-) rename 
atom/{branch => data}/__init__.py (100%) rename atom/{branch => data}/branch.py (100%) rename atom/{branch => data}/branchmanager.py (100%) rename atom/{branch => data}/dataengines.py (100%) rename tests/{test_branch.py => test_data.py} (99%) diff --git a/atom/branch/__init__.py b/atom/data/__init__.py similarity index 100% rename from atom/branch/__init__.py rename to atom/data/__init__.py diff --git a/atom/branch/branch.py b/atom/data/branch.py similarity index 100% rename from atom/branch/branch.py rename to atom/data/branch.py diff --git a/atom/branch/branchmanager.py b/atom/data/branchmanager.py similarity index 100% rename from atom/branch/branchmanager.py rename to atom/data/branchmanager.py diff --git a/atom/branch/dataengines.py b/atom/data/dataengines.py similarity index 100% rename from atom/branch/dataengines.py rename to atom/data/dataengines.py diff --git a/tests/test_branch.py b/tests/test_data.py similarity index 99% rename from tests/test_branch.py rename to tests/test_data.py index d168ec396..2cdf2c155 100644 --- a/tests/test_branch.py +++ b/tests/test_data.py @@ -20,7 +20,7 @@ from sklearn.preprocessing import MinMaxScaler, StandardScaler from atom import ATOMClassifier, ATOMRegressor -from atom.branch import Branch, BranchManager +from atom.data import Branch, BranchManager from atom.training import DirectClassifier from atom.utils.utils import merge From 1e99a1c35eb18c994a0faebcad07b5820976218e Mon Sep 17 00:00:00 2001 From: Mavs Date: Sun, 18 Feb 2024 19:19:20 +0100 Subject: [PATCH 06/12] dataengines 6 --- atom/atom.py | 55 +++++--- atom/basemodel.py | 44 +++--- atom/baserunner.py | 4 +- atom/basetrainer.py | 8 +- atom/basetransformer.py | 57 ++++---- atom/data/__init__.py | 5 +- atom/data/branch.py | 49 ++++--- atom/data/branchmanager.py | 2 +- atom/data/dataengines.py | 76 +++++++---- atom/data_cleaning.py | 123 +++++++++-------- atom/feature_engineering.py | 45 +++---- atom/models/classreg.py | 33 ++--- atom/models/ts.py | 9 +- atom/nlp.py | 50 ++++--- atom/pipeline.py | 84 +++++++----- atom/plots/baseplot.py | 14 +- atom/plots/predictionplot.py | 16 ++- atom/utils/types.py | 11 +- atom/utils/utils.py | 243 +++++++++++++++++++--------------- docs_sources/dependencies.md | 23 ++-- pyproject.toml | 27 ++-- tests/test_atom.py | 15 +++ tests/test_basemodel.py | 15 +++ tests/test_baserunner.py | 2 +- tests/test_basetrainer.py | 4 +- tests/test_basetransformer.py | 7 - tests/test_pipeline.py | 13 ++ tests/test_training.py | 2 +- tests/test_utils.py | 12 +- 29 files changed, 593 insertions(+), 455 deletions(-) diff --git a/atom/atom.py b/atom/atom.py index 56d8557df..3aedd7ae4 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -24,13 +24,13 @@ from beartype import beartype from joblib.memory import Memory from pandas._typing import DtypeObj -from polars.dependencies import _lazy_import +from scipy import stats from sklearn.pipeline import Pipeline as SkPipeline from sklearn.utils.metaestimators import available_if from atom.baserunner import BaseRunner from atom.basetransformer import BaseTransformer -from atom.branch import Branch, BranchManager +from atom.data import Branch, BranchManager from atom.data_cleaning import ( Balancer, Cleaner, Decomposer, Discretizer, Encoder, Imputer, Normalizer, Pruner, Scaler, TransformerMixin, @@ -59,18 +59,13 @@ XSelector, YSelector, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, + ClassMap, DataConfig, DataContainer, Goal, adjust, check_dependency, composed, crash, fit_one, 
flt, get_cols, get_custom_scorer, has_task, is_sparse, lst, make_sklearn, merge, method_to_log, n_cols, replace_missing, sign, ) -stats, _ = _lazy_import("scipy.stats") -diagnostic, _ = _lazy_import("statsmodels.stats.diagnostic") -stattools, _ = _lazy_import("statsmodels.tsa.stattools") - - T_Transformer = TypeVar("T_Transformer", bound=Transformer) @@ -484,6 +479,9 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame: - **p_value:** Corresponding p-value. """ + from statsmodels.stats.diagnostic import acorr_ljungbox + from statsmodels.tsa.stattools import adfuller, kpss + columns_c = self.branch._get_columns(columns, only_numerical=True) df = pd.DataFrame( @@ -500,11 +498,12 @@ def checks(self, *, columns: ColumnSelector | None = None) -> pd.DataFrame: for test in ("adf", "kpss", "lb"): if test == "adf": - stat = stattools.adfuller(X, maxlag=None, autolag="AIC") + stat = adfuller(X, maxlag=None, autolag="AIC") elif test == "kpss": - stat = stattools.kpss(X, regression="ct", nlags="auto") # ct is trend stationarity + # regression='ct' is trend stationarity + stat = kpss(X, regression="ct", nlags="auto") elif test == "lb": - l_jung = diagnostic.acorr_ljungbox(X, lags=None, period=lst(self.sp.sp)[0]) + l_jung = acorr_ljungbox(X, lags=None, period=lst(self.sp.sp)[0]) stat = l_jung.loc[l_jung["lb_pvalue"].idxmin()] # Add as column to the dataframe @@ -684,7 +683,7 @@ def inverse_transform( Parameters ---------- - Transformed feature set with shape=(n_samples, n_features). + X: Transformed feature set with shape=(n_samples, n_features). If None, `X` is ignored in the transformers. y: int, str, sequence, dataframe-like or None, default=None @@ -712,8 +711,8 @@ def inverse_transform( """ Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return self._convert(pipeline.inverse_transform(Xt, yt)) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.inverse_transform(Xt, yt) @classmethod def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM: @@ -1134,8 +1133,8 @@ def transform( """ Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return self._convert(pipeline.transform(Xt, yt)) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.transform(Xt, yt) # Base transformers ============================================ >> @@ -1143,11 +1142,15 @@ def _prepare_kwargs( self, kwargs: dict[str, Any], params: MappingProxyType | None = None, + *, + is_runner: Bool = False, ) -> dict[str, Any]: """Return kwargs with atom's values if not specified. This method is used for all transformers and runners to pass - atom's BaseTransformer's properties to the classes. + atom's BaseTransformer's properties to the classes. The engine + parameter is the only one that is modified for non-runners + since ATOM's transformers only accept the estimator engine. Parameters ---------- @@ -1157,6 +1160,9 @@ def _prepare_kwargs( params: mappingproxy or None, default=None Parameters in the class' signature. + is_runner: bool, default=False + Whether the params are passed to a runner. 
+ Returns ------- dict @@ -1165,7 +1171,12 @@ def _prepare_kwargs( """ for attr in BaseTransformer.attrs: if (not params or attr in params) and attr not in kwargs: - kwargs[attr] = getattr(self, attr) + if attr == "engine" and not is_runner: + # Engine parameter is special since we don't + # want to change data engines in the pipeline + kwargs[attr] = getattr(self, attr).estimator + else: + kwargs[attr] = getattr(self, attr) return kwargs @@ -2215,7 +2226,7 @@ def _run(self, trainer: BaseRunner): Instance that does the actual model training. """ - if any(col.dtype.kind not in "ifu" for col in get_cols(self.y)): + if any(col.dtype.kind not in "ifu" for col in get_cols(self.branch.y)): raise ValueError( "The target column is not numerical. Use atom.clean() " "to encode the target column to numerical values." @@ -2289,7 +2300,7 @@ def run( n_bootstrap=n_bootstrap, parallel=parallel, errors=errors, - **self._prepare_kwargs(kwargs), + **self._prepare_kwargs(kwargs, is_runner=True), ) ) @@ -2351,7 +2362,7 @@ class for a description of the parameters. n_bootstrap=n_bootstrap, parallel=parallel, errors=errors, - **self._prepare_kwargs(kwargs), + **self._prepare_kwargs(kwargs, is_runner=True), ) ) @@ -2411,6 +2422,6 @@ class for a description of the parameters. n_bootstrap=n_bootstrap, parallel=parallel, errors=errors, - **self._prepare_kwargs(kwargs), + **self._prepare_kwargs(kwargs, is_runner=True), ) ) diff --git a/atom/basemodel.py b/atom/basemodel.py index a655e6709..b2ec92108 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -15,15 +15,13 @@ from importlib import import_module from logging import Logger from pathlib import Path -from typing import Any, Literal, overload +from typing import TYPE_CHECKING, Any, Literal, overload from unittest.mock import patch import dill as pickle import mlflow import numpy as np -import optuna import pandas as pd -import ray from beartype import beartype from joblib.memory import Memory from joblib.parallel import Parallel, delayed @@ -37,7 +35,6 @@ from optuna.study import Study from optuna.terminator import report_cross_validation_scores from optuna.trial import FrozenTrial, Trial, TrialState -from ray import serve from sklearn.base import clone from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import roc_curve @@ -57,9 +54,8 @@ from sktime.performance_metrics.forecasting import make_forecasting_scorer from sktime.proba.normal import Normal from sktime.split import ExpandingWindowSplitter, SingleWindowSplitter -from starlette.requests import Request - -from atom.branch import Branch, BranchManager +import optuna +from atom.data import Branch, BranchManager from atom.data_cleaning import Scaler from atom.pipeline import Pipeline from atom.plots import RunnerPlot @@ -74,13 +70,17 @@ ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, - TrialsCallback, adjust_verbosity, cache, check_dependency, check_empty, + TrialsCallback, adjust, cache, check_dependency, check_empty, composed, crash, estimator_has_attr, flt, get_col_names, get_cols, get_custom_scorer, has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df, to_series, to_tabular, ) +if TYPE_CHECKING: + from starlette.requests import Request + + # Disable optuna info logs (ATOM already displays the same info) optuna.logging.set_verbosity(optuna.logging.WARNING) @@ -129,9 +129,16 @@ class BaseModel(RunnerPlot): - "data": + - "numpy" - "pandas" (default) + - "pandas-pyarrow" + - "polars" + - "polars-lazy" - "pyarrow" 
- "modin" + - "dask" + - "pyspark" + - "pyspark-pandas" - "estimator": @@ -2279,8 +2286,8 @@ def inverse_transform( """ Xt, yt = self._check_input(X, y, columns=self.branch.features, name=self.branch.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.inverse_transform(Xt, yt) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.inverse_transform(Xt, yt) @composed(crash, method_to_log, beartype) def register( @@ -2380,8 +2387,11 @@ def serve(self, method: str = "predict", host: str = "127.0.0.1", port: Int = 80 Port for HTTP server. """ + check_dependency("ray") + import ray + from ray.serve import deployment, run - @serve.deployment + @deployment class ServeModel: """Model deployment class. @@ -2420,11 +2430,7 @@ async def __call__(self, request: Request) -> np.ndarray: if not ray.is_initialized(): ray.init(log_to_driver=False) - server = ServeModel.bind( - pipeline=self.export_pipeline(), - method=method, - ) - serve.run(server, host=host, port=port) + run(ServeModel.bind(pipeline=self.export_pipeline(), method=method), host=host, port=port) self._log(f"Serving model {self.fullname} on {host}:{port}...", 1) @@ -2477,8 +2483,8 @@ def transform( """ Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) - with adjust_verbosity(self.pipeline, verbose) as pipeline: - return pipeline.transform(Xt, yt) + with adjust(self.pipeline, transform=self.engine.data, verbose=verbose) as pl: + return pl.transform(Xt, yt) class ClassRegModel: @@ -2646,7 +2652,7 @@ def assign_prediction_columns() -> list[str]: if method != "score": pred = np.array(self.memory.cache(getattr(self.estimator, method))(Xt[self.features])) - if pred.ndim == 1: + if pred.ndim == 1 or pred.shape[1] == 1: data = to_series(pred, index=Xt.index, name=self.target) elif pred.ndim < 3: data = to_df(pred, index=Xt.index, columns=assign_prediction_columns()) diff --git a/atom/baserunner.py b/atom/baserunner.py index f83eed87b..eb8a9a895 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -32,7 +32,7 @@ from atom.basetracker import BaseTracker from atom.basetransformer import BaseTransformer -from atom.branch import Branch +from atom.data import Branch from atom.models import MODELS, Stacking, Voting from atom.pipeline import Pipeline from atom.utils.constants import DF_ATTRS @@ -455,7 +455,7 @@ def _subsample(df: pd.DataFrame) -> pd.DataFrame: return df.iloc[sorted(random.sample(range(len(df)), k=n_rows))] def _set_index( - df: DataFrame, + df: pd.DataFrame, y: Pandas | None, index: IndexSelector | None = None, ) -> pd.DataFrame: diff --git a/atom/basetrainer.py b/atom/basetrainer.py index 1d64279ef..3a74b1c49 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -12,15 +12,13 @@ from datetime import datetime as dt from typing import Any -import dask import mlflow import numpy as np -import ray from joblib import Parallel, delayed from optuna import Study, create_study from atom.baserunner import BaseRunner -from atom.branch import BranchManager +from atom.data import BranchManager from atom.data_cleaning import BaseTransformer from atom.models import MODELS, CustomModel from atom.plots import RunnerPlot @@ -374,6 +372,8 @@ def execute_model(m: Model) -> Model | None: m.verbose = self.verbose if self.backend == "ray": + import ray + # This implementation is more efficient than through joblib's # ray backend. 
The difference is that in this one you start N # tasks, and in the other, you start N actors and then have them @@ -381,6 +381,8 @@ def execute_model(m: Model) -> Model | None: execute_remote = ray.remote(num_cpus=self.n_jobs)(execute_model) models = ray.get([execute_remote.remote(m) for m in self._models]) elif self.backend == "dask": + import dask + models = dask.compute(*[dask.delayed(execute_model)(m) for m in self._models]) else: models = Parallel(n_jobs=self.n_jobs)( diff --git a/atom/basetransformer.py b/atom/basetransformer.py index b66e0a203..2586332c7 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -12,41 +12,33 @@ import re import tempfile import warnings -from copy import deepcopy +from collections.abc import Hashable from datetime import datetime as dt from importlib import import_module from importlib.util import find_spec from logging import DEBUG, FileHandler, Formatter, Logger, getLogger from multiprocessing import cpu_count from pathlib import Path -from typing import Any, TypeVar, overload +from typing import Any, Literal, TypeVar, overload import joblib +import mlflow import numpy as np import pandas as pd -import requests from beartype import beartype from joblib.memory import Memory -from polars.dependencies import _lazy_import from sklearn.utils.validation import check_memory from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, - EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Sequence, - Severity, Verbose, Warnings, bool_t, int_t, + EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas, + Sequence, Severity, Verbose, Warnings, XSelector, YSelector, bool_t, int_t, ) from atom.utils.utils import ( check_dependency, crash, lst, make_sklearn, to_df, to_tabular, ) -mlflow, _ = _lazy_import("mlflow") -dagshub, _ = _lazy_import("dagshub") -ray, _ = _lazy_import("ray") -ray_joblib, _ = _lazy_import("ray.util.joblib") -dask, _ = _lazy_import("dask") - - T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -145,16 +137,11 @@ def engine(self, value: Engine): check_dependency(engine.data_engine.library) if engine.estimator == "sklearnex": - if not find_spec("sklearnex"): - raise ModuleNotFoundError( - "Failed to import scikit-learn-intelex. The library is " - "not installed. Note that the library only supports CPUs " - "with a x86 architecture." 
- ) - else: - import sklearnex + check_dependency("sklearnex") + import sklearnex + + sklearnex.set_config(self.device.lower() if self._gpu else "auto") - sklearnex.set_config(self.device.lower() if self._gpu else "auto") elif engine.estimator == "cuml": if not find_spec("cuml"): raise ModuleNotFoundError( @@ -182,10 +169,18 @@ def backend(self) -> Backend: @beartype def backend(self, value: Backend): if value == "ray": - ray_joblib.register_ray() # Register ray as joblib backend + check_dependency("ray") + import ray + from ray.util.joblib import register_ray + + register_ray() # Register ray as joblib backend if not ray.is_initialized(): ray.init(log_to_driver=False) + elif value == "dask": + check_dependency("dask") + import dask + try: dask.distributed.Client.current() except ValueError: @@ -302,6 +297,12 @@ def experiment(self, value: str | None): self._experiment = value if value: if value.lower().startswith("dagshub:"): + check_dependency("dagshub") + check_dependency("requests") + import dagshub + import requests + from dagshub.auth.token_auth import HTTPBearerAuth + value = value[8:] # Drop dagshub: token = dagshub.auth.get_token() @@ -311,7 +312,7 @@ def experiment(self, value: str | None): # Fetch username from dagshub api username = requests.get( url="https://dagshub.com/api/v1/user", - auth=dagshub.auth.token_auth.HTTPBearerAuth(token), + auth=HTTPBearerAuth(token), timeout=5, ).json()["username"] @@ -442,7 +443,7 @@ def _check_input( if X is None and y is None: raise ValueError("X and y can't be both None!") else: - Xt = to_df(deepcopy(X() if callable(X) else X), columns=columns) + Xt = to_df(X() if callable(X) else X, columns=columns) # Prepare target column if not isinstance(y, Int | str | None): @@ -471,7 +472,7 @@ def _check_input( f" got len(X)={len(Xt)} and len(y)={len(y)}." 
) from None else: - yt = to_tabular(deepcopy(y), index=getattr(Xt, "index", None), columns=name) + yt = to_tabular(y, index=getattr(Xt, "index", None), columns=name) # Check X and y have the same indices if Xt is not None and not Xt.index.equals(yt.index): @@ -514,8 +515,8 @@ def _convert(self, obj: Any) -> Any: """ # Only apply transformations when the engine is defined - if hasattr(self, "engine") and isinstance(obj, pd.Series | pd.DataFrame): - return self.engine.data_engine.convert(obj) + if hasattr(self, "_engine") and isinstance(obj, pd.Series | pd.DataFrame): + return self._engine.data_engine.convert(obj) else: return obj diff --git a/atom/data/__init__.py b/atom/data/__init__.py index dd6f3adc1..236e72416 100644 --- a/atom/data/__init__.py +++ b/atom/data/__init__.py @@ -5,5 +5,6 @@ """ -from atom.branch.branch import Branch -from atom.branch.branchmanager import BranchManager +from atom.data.branch import Branch +from atom.data.branchmanager import BranchManager +from atom.data.dataengines import DATA_ENGINES diff --git a/atom/data/branch.py b/atom/data/branch.py index f37828a97..fd5710ab4 100644 --- a/atom/data/branch.py +++ b/atom/data/branch.py @@ -14,18 +14,18 @@ from typing import Literal, overload from warnings import filterwarnings +import dill as pickle import pandas as pd from beartype import beartype from beartype.roar import BeartypeDecorHintPep585DeprecationWarning from joblib.memory import Memory -from polars.dependencies import _lazy_import from sklearn.utils.validation import check_memory from atom.pipeline import Pipeline from atom.utils.types import ( Bool, ColumnSelector, Int, IntLargerEqualZero, Pandas, RowSelector, Scalar, - Sequence, TargetSelector, TargetsSelector, XConstructor, XSelector, - YSelector, int_t, segment_t, + TargetSelector, TargetsSelector, XConstructor, XDatasets, YConstructor, + YDatasets, int_t, segment_t, ) from atom.utils.utils import ( DataContainer, check_scaling, flt, get_col_names, get_cols, lst, merge, @@ -33,9 +33,6 @@ ) -pickle, _ = _lazy_import("dill") - - filterwarnings("ignore", category=BeartypeDecorHintPep585DeprecationWarning) @@ -103,7 +100,7 @@ class Branch: """ - _shared_attrs = [ + _shared_attrs = ( "pipeline", "mapping", "dataset", @@ -121,7 +118,7 @@ class Branch: "features", "n_features", "target", - ] + ) def __init__( self, @@ -187,11 +184,13 @@ def name(self, value: str): # Data properties ============================================== >> - def _check_setter( - self, - name: str, - value: Sequence[Scalar | str] | XConstructor, - ) -> Pandas: + @overload + def _check_setter(self, name: XDatasets, value: YConstructor) -> pd.DataFrame: ... + + @overload + def _check_setter(self, name: YDatasets, value: YConstructor) -> pd.Series: ... + + def _check_setter(self, name: XDatasets | YDatasets, value: YConstructor) -> Pandas: """Check the data set's setter property. 
Convert the property to a 'pandas' object and compare with the @@ -325,7 +324,7 @@ def dataset(self) -> pd.DataFrame: return self._data.data @dataset.setter - def dataset(self, value: XSelector): + def dataset(self, value: XConstructor): self._data.data = self._check_setter("dataset", value) @property @@ -334,7 +333,7 @@ def train(self) -> pd.DataFrame: return self._data.data.loc[self._data.train_idx] @train.setter - def train(self, value: XSelector): + def train(self, value: XConstructor): df = self._check_setter("train", value) self._data.data = pd.concat([df, self.test]) self._data.train_idx = df.index @@ -345,7 +344,7 @@ def test(self) -> pd.DataFrame: return self._data.data.loc[self._data.test_idx] @test.setter - def test(self, value: XSelector): + def test(self, value: XConstructor): df = self._check_setter("test", value) self._data.data = pd.concat([self.train, df]) self._data.test_idx = df.index @@ -369,7 +368,7 @@ def X(self) -> pd.DataFrame: return self._data.data[self.features] @X.setter - def X(self, value: XSelector): + def X(self, value: XConstructor): df = self._check_setter("X", value) self._data.data = merge(df, self.y) @@ -379,7 +378,7 @@ def y(self) -> Pandas: return self._data.data[self.target] @y.setter - def y(self, value: YSelector): + def y(self, value: YConstructor): series = self._check_setter("y", value) self._data.data = merge(self.X, series) @@ -389,7 +388,7 @@ def X_train(self) -> pd.DataFrame: return self.train[self.features] @X_train.setter - def X_train(self, value: XSelector): + def X_train(self, value: XConstructor): df = self._check_setter("X_train", value) self._data.data = pd.concat([merge(df, self.y_train), self.test]) @@ -399,7 +398,7 @@ def y_train(self) -> Pandas: return self.train[self.target] @y_train.setter - def y_train(self, value: YSelector): + def y_train(self, value: YConstructor): series = self._check_setter("y_train", value) self._data.data = pd.concat([merge(self.X_train, series), self.test]) @@ -409,7 +408,7 @@ def X_test(self) -> pd.DataFrame: return self.test[self.features] @X_test.setter - def X_test(self, value: XSelector): + def X_test(self, value: XConstructor): df = self._check_setter("X_test", value) self._data.data = pd.concat([self.train, merge(df, self.y_test)]) @@ -419,7 +418,7 @@ def y_test(self) -> Pandas: return self.test[self.target] @y_test.setter - def y_test(self, value: YSelector): + def y_test(self, value: YConstructor): series = self._check_setter("y_test", value) self._data.data = pd.concat([self.train, merge(self.X_test, series)]) @@ -461,7 +460,7 @@ def _all(self) -> pd.DataFrame: calculation. """ - return pd.concat([self.dataset, self.holdout]) + return pd.concat([self.dataset, self.holdout]) # type: ignore[list-item] # Utility methods ============================================== >> @@ -475,7 +474,7 @@ def _get_shared_attrs(self) -> list[str]: """ instance_vars = [x for x in vars(self) if not x.startswith("_") and x.endswith("_")] - return self._shared_attrs + instance_vars + return list(self._shared_attrs) + instance_vars @overload def _get_rows( @@ -483,7 +482,7 @@ def _get_rows( rows: RowSelector, *, return_X_y: Literal[False] = ..., - ) -> DataFrame: ... + ) -> pd.DataFrame: ... 
@overload def _get_rows( diff --git a/atom/data/branchmanager.py b/atom/data/branchmanager.py index 6e2c41fe8..66f7ed3de 100644 --- a/atom/data/branchmanager.py +++ b/atom/data/branchmanager.py @@ -16,7 +16,7 @@ from joblib.memory import Memory from sklearn.utils.validation import check_memory -from atom.branch.branch import Branch +from atom.data.branch import Branch from atom.utils.types import Bool, Int from atom.utils.utils import ClassMap, DataContainer diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py index a72a558a9..113163f2c 100644 --- a/atom/data/dataengines.py +++ b/atom/data/dataengines.py @@ -9,22 +9,24 @@ import os from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING import numpy as np import pandas as pd -import polars as pl -from polars.dependencies import _lazy_import -from atom.utils.types import Any, Pandas, Sequence -from atom.utils.utils import get_cols +from atom.utils.types import Any, Pandas -os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" +if TYPE_CHECKING: + import dask.dataframe as dd + import modin.pandas as md + import polars as pl + import pyarrow as pa + import pyspark.pandas as ps + -dd, _ = _lazy_import("dask.dataframe") -md, _ = _lazy_import("modin.pandas") -pa, _ = _lazy_import("pyarrow") -ps, _ = _lazy_import("pyspark") +# Avoid warning about pyarrow timezones not set +os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" class DataEngine(metaclass=ABCMeta): @@ -37,7 +39,9 @@ class DataEngine(metaclass=ABCMeta): @staticmethod @abstractmethod - def convert(obj: Pandas) -> np.ndarray | Sequence[Any] | pd.DataFrame: ... + def convert(obj: Pandas) -> Any: + """Convert to data engine output types.""" + pass class NumpyEngine(DataEngine): @@ -70,14 +74,20 @@ class PandasPyarrowEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> Pandas: """Convert to pyarrow dtypes.""" - return obj.astype( - { - col.name: pd.ArrowDtype( - pa.from_numpy_dtype(getattr(col.dtype, "numpy_dtype", col.dtype)) - ) - for col in get_cols(obj) - } - ) + from pyarrow import from_numpy_dtype + + if isinstance(obj, pd.DataFrame): + return obj.astype( + { + c: pd.ArrowDtype(from_numpy_dtype(d)) if isinstance(d, np.dtype) else d + for c, d in obj.dtypes.items() + } + ) + else: + return obj.astype( + pd.ArrowDtype(from_numpy_dtype(obj.dtype)) + if isinstance(obj.dtype, np.dtype) else obj.dtype + ) class PolarsEngine(DataEngine): @@ -88,9 +98,11 @@ class PolarsEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> pl.Series | pl.DataFrame: """Convert to polars objects.""" + import polars as pl + if isinstance(obj, pd.DataFrame): return pl.DataFrame(obj) - elif isinstance(obj, pd.Series): + else: return pl.Series(obj) @@ -100,11 +112,13 @@ class PolarsLazyEngine(DataEngine): library = "polars" @staticmethod - def convert(obj: Pandas) -> pl.Series | pl.DataFrame: + def convert(obj: Pandas) -> pl.Series | pl.LazyFrame: """Convert to lazy polars objects.""" + import polars as pl + if isinstance(obj, pd.DataFrame): return pl.LazyFrame(obj) - elif isinstance(obj, pd.Series): + else: return pl.Series(obj) @@ -116,9 +130,11 @@ class PyArrowEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> pa.Array | pa.Table: """Convert to pyarrow objects.""" + import pyarrow as pa + if isinstance(obj, pd.DataFrame): return pa.Table.from_pandas(obj) - elif isinstance(obj, pd.Series): + else: return pa.Array.from_pandas(obj) @@ -130,9 +146,11 @@ class ModinEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> md.Series | md.DataFrame: """Convert to modin 
objects.""" + import modin.pandas as md + if isinstance(obj, pd.DataFrame): return md.DataFrame(obj) - elif isinstance(obj, pd.Series): + else: return md.Series(obj) @@ -144,7 +162,9 @@ class DaskEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> dd.Series | dd.DataFrame: """Convert to dask objects.""" - return dd.from_pandas(obj, npartitions=max(1, len(obj) // 1e6)) + import dask.dataframe as dd + + return dd.from_pandas(obj, npartitions=int(max(1, len(obj) // 1e6))) class PySparkEngine(DataEngine): @@ -155,7 +175,9 @@ class PySparkEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> ps.sql.DataFrame: """Convert to pyspark objects.""" - spark = ps.sql.SparkSession.builder.appName("atom-ml").getOrCreate() + from pyspark.sql import SparkSession + + spark = SparkSession.builder.appName("atom-ml").getOrCreate() return spark.createDataFrame(obj) @@ -167,9 +189,11 @@ class PySparkPandasEngine(DataEngine): @staticmethod def convert(obj: Pandas) -> ps.pandas.Series | ps.pandas.DataFrame: """Convert to pyspark objects.""" + import pyspark.pandas as ps + if isinstance(obj, pd.DataFrame): return ps.pandas.DataFrame(obj) - elif isinstance(obj, pd.Series): + else: return ps.pandas.Series(obj) diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index a7da4778a..fde80861e 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -16,7 +16,22 @@ import pandas as pd import sklearn from beartype import beartype -from polars.dependencies import _lazy_import +from category_encoders import ( + BackwardDifferenceEncoder, BaseNEncoder, BinaryEncoder, CatBoostEncoder, + HelmertEncoder, JamesSteinEncoder, MEstimateEncoder, OneHotEncoder, + OrdinalEncoder, PolynomialEncoder, SumEncoder, TargetEncoder, WOEEncoder, +) +from imblearn.combine import SMOTEENN, SMOTETomek +from imblearn.over_sampling import ( + ADASYN, SMOTE, SMOTEN, SMOTENC, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE, + RandomOverSampler, +) +from imblearn.under_sampling import ( + AllKNN, CondensedNearestNeighbour, EditedNearestNeighbours, + InstanceHardnessThreshold, NearMiss, NeighbourhoodCleaningRule, + OneSidedSelection, RandomUnderSampler, RepeatedEditedNearestNeighbours, + TomekLinks, +) from scipy.stats import zscore from sklearn.base import ( BaseEstimator, OneToOneFeatureMixin, _clone_parametrized, @@ -25,6 +40,7 @@ from sklearn.experimental import enable_iterative_imputer # noqa: F401 from sklearn.impute import IterativeImputer, KNNImputer from sklearn.utils.validation import _check_feature_names_in +from sktime.transformations.series.impute import Imputer from typing_extensions import Self from atom.basetransformer import BaseTransformer @@ -35,7 +51,7 @@ IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, - YConstructor, sequence_t, + YConstructor, sequence_t, EngineEstimatorOptions ) from atom.utils.utils import ( Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst, @@ -44,11 +60,6 @@ ) -category_encoders, _ = _lazy_import("category_encoders") -imblearn, _ = _lazy_import("imblearn") -sktime, _ = _lazy_import("sktime") - - T_Transformer = TypeVar("T_Transformer", bound=Transformer) @@ -189,7 +200,11 @@ def inverse_transform( check_is_fitted(self) Xt = to_df(X, columns=self.feature_names_in_) - yt = to_tabular(y, index=Xt.index, columns=getattr(y, "target_names_in_", None)) + yt = to_tabular( + data=y, + index=getattr(Xt, "index", 
None), + columns=getattr(y, "target_names_in_", None), + ) return variable_return(self._convert(Xt), self._convert(yt)) @@ -224,10 +239,9 @@ def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: Estimator instance. """ - if transform is None: - return self + if transform is not None: + self._engine = getattr(self, "_engine", EngineTuple()).data = transform - self.engine = getattr(self, "engine", EngineTuple()).data = transform return self @@ -395,28 +409,28 @@ def fit(self, X: XConstructor, y: YConstructor) -> Self: else: raise ValueError("The Balancer class does not support multioutput tasks.") + # ClusterCentroids is unavailable since it has no sample_indices_ strategies = { - # clustercentroids=imblearn.under_sampling.ClusterCentroids, # noqa: ERA001 (has no sample_indices_) - "condensednearestneighbour": imblearn.under_sampling.CondensedNearestNeighbour, - "editednearestneighborus": imblearn.under_sampling.EditedNearestNeighbours, - "repeatededitednearestneighbours": imblearn.under_sampling.RepeatedEditedNearestNeighbours, - "allknn": imblearn.under_sampling.AllKNN, - "instancehardnessthreshold": imblearn.under_sampling.InstanceHardnessThreshold, - "nearmiss": imblearn.under_sampling.NearMiss, - "neighbourhoodcleaningrule": imblearn.under_sampling.NeighbourhoodCleaningRule, - "onesidedselection": imblearn.under_sampling.OneSidedSelection, - "randomundersampler": imblearn.under_sampling.RandomUnderSampler, - "tomeklinks": imblearn.under_sampling.TomekLinks, - "randomoversampler": imblearn.over_sampling.RandomOverSampler, - "smote": imblearn.over_sampling.SMOTE, - "smotenc": imblearn.over_sampling.SMOTENC, - "smoten": imblearn.over_sampling.SMOTEN, - "adasyn": imblearn.over_sampling.ADASYN, - "borderlinesmote": imblearn.over_sampling.BorderlineSMOTE, - "kmeanssmote": imblearn.over_sampling.KMeansSMOTE, - "svmsmote": imblearn.over_sampling.SVMSMOTE, - "smoteenn": imblearn.combine.SMOTEENN, - "smotetomek": imblearn.combine.SMOTETomek, + "condensednearestneighbour": CondensedNearestNeighbour, + "editednearestneighborus": EditedNearestNeighbours, + "repeatededitednearestneighbours": RepeatedEditedNearestNeighbours, + "allknn": AllKNN, + "instancehardnessthreshold": InstanceHardnessThreshold, + "nearmiss": NearMiss, + "neighbourhoodcleaningrule": NeighbourhoodCleaningRule, + "onesidedselection": OneSidedSelection, + "randomundersampler": RandomUnderSampler, + "tomeklinks": TomekLinks, + "randomoversampler": RandomOverSampler, + "smote": SMOTE, + "smotenc": SMOTENC, + "smoten": SMOTEN, + "adasyn": ADASYN, + "borderlinesmote": BorderlineSMOTE, + "kmeanssmote": KMeansSMOTE, + "svmsmote": SVMSMOTE, + "smoteenn": SMOTEENN, + "smotetomek": SMOTETomek, } if isinstance(self.strategy, str): @@ -698,7 +712,7 @@ def __init__( drop_missing_target: Bool = True, encode_target: Bool = True, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, ): super().__init__(device=device, engine=engine, verbose=verbose) @@ -1172,18 +1186,18 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: self._estimators: dict[Hashable, tuple[Transformer, Transformer]] = {} for name, column in Xt.select_dtypes(include="number").items(): - trend = sktime.transformations.series.detrend.Detrender( + trend = Detrender( forecaster=forecaster, model=self.trend_model, ).fit(column) if self.test_seasonality: - season = sktime.transformations.series.detrend.ConditionalDeseasonalizer( + season = ConditionalDeseasonalizer( sp=self.sp or 1, 
model=self.seasonal_model, ).fit(trend.transform(column)) else: - season = sktime.transformations.series.detrend.Deseasonalizer( + season = Deseasonalizer( sp=self.sp or 1, model=self.seasonal_model, ).fit(trend.transform(column)) @@ -1392,7 +1406,7 @@ def __init__( bins: Bins = 5, labels: Sequence[str] | dict[str, Sequence[str]] | None = None, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, ): @@ -1756,18 +1770,18 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: self._categories = {} strategies = { - "backwarddifference": category_encoders.BackwardDifferenceEncoder, - "basen": category_encoders.BaseNEncoder, - "binary": category_encoders.BinaryEncoder, - "catboost": category_encoders.CatBoostEncoder, - "helmert": category_encoders.HelmertEncoder, - "jamesstein": category_encoders.JamesSteinEncoder, - "mestimate": category_encoders.MEstimateEncoder, - "ordinal": category_encoders.OrdinalEncoder, - "polynomial": category_encoders.PolynomialEncoder, - "sum": category_encoders.SumEncoder, - "target": category_encoders.TargetEncoder, - "woe": category_encoders.WOEEncoder, + "backwarddifference": BackwardDifferenceEncoder, + "basen": BaseNEncoder, + "binary": BinaryEncoder, + "catboost": CatBoostEncoder, + "helmert": HelmertEncoder, + "jamesstein": JamesSteinEncoder, + "mestimate": MEstimateEncoder, + "ordinal": OrdinalEncoder, + "polynomial": PolynomialEncoder, + "sum": SumEncoder, + "target": TargetEncoder, + "woe": WOEEncoder, } Xt = to_df(X) @@ -2107,7 +2121,7 @@ def __init__( max_nan_cols: FloatLargerZero | None = None, n_jobs: NJobs = 1, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, ): @@ -2194,8 +2208,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: elif self.strat_num == "drop": num_imputer = "passthrough" else: - sktimeImputer = make_sklearn(sktime.transformations.series.impute.Imputer) - num_imputer = sktimeImputer( + num_imputer = make_sklearn(Imputer)( method=self.strat_num, missing_values=[pd.NA], random_state=self.random_state, @@ -2500,7 +2513,7 @@ def __init__( strategy: NormalizerStrats = "yeojohnson", *, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, **kwargs, @@ -2778,7 +2791,7 @@ def __init__( max_sigma: FloatLargerZero = 3, include_target: Bool = False, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, **kwargs, ): @@ -3039,7 +3052,7 @@ def __init__( *, include_binary: Bool = False, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, **kwargs, ): diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index c9a1fce81..8a5b34b39 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -16,7 +16,6 @@ import pandas as pd from beartype import beartype from gplearn.genetic import SymbolicTransformer -from polars.dependencies import _lazy_import from scipy import stats from sklearn.base import is_classifier from sklearn.feature_selection import ( @@ -26,14 +25,18 @@ from sklearn.model_selection import cross_val_score from sklearn.utils.validation import _check_feature_names_in from typing_extensions import Self +from zoofs import ( + DragonFlyOptimization, 
GeneticOptimization, GreyWolfOptimization, + HarrisHawkOptimization, ParticleSwarmOptimization, +) from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, - IntLargerEqualZero, IntLargerZero, NJobs, Operators, Pandas, Scalar, - Sequence, Verbose, XConstructor, YConstructor, + IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, + Verbose, XConstructor, YConstructor, EngineEstimatorOptions ) from atom.utils.utils import ( Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse, @@ -41,9 +44,6 @@ ) -zoofs, _ = _lazy_import("zoofs") - - @beartype class FeatureExtractor(TransformerMixin): """Extract features from datetime columns. @@ -438,7 +438,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """ Xt = to_df(X) - yt = to_tabular(y, index=getattr(Xt, "index", None)) + yt = to_tabular(y, index=Xt.index) self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) @@ -529,11 +529,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr if self.strategy == "dfs": es = ft.EntitySet(dataframes={"X": (Xt, "index", None, None, None, True)}) - dfs = ft.calculate_feature_matrix( - features=self._dfs, - entityset=es, - n_jobs=self.n_jobs, - ) + dfs = ft.calculate_feature_matrix(self._dfs, entityset=es, n_jobs=self.n_jobs) # Add the new features to the feature set Xt = pd.concat([Xt, dfs], axis=1).set_index("index") @@ -989,7 +985,7 @@ def __init__( max_correlation: FloatZeroToOneInc | None = 1.0, n_jobs: NJobs = 1, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, **kwargs, @@ -1050,7 +1046,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): return scoring(model, X_valid, y_valid) Xt = to_df(X) - tt = to_tabular(y, index=Xt.index) + yt = to_tabular(y, index=Xt.index) self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) @@ -1064,20 +1060,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._estimator: Any = None self._n_features = None - strategies = { - "univariate": "SelectKBest", - "pca": "PCA", - "sfm": "SelectFromModel", - "sfs": "SequentialFeatureSelector", - "rfe": "RFE", - "rfecv": "RFECV", - "pso": zoofs.ParticleSwarmOptimization, - "hho": zoofs.HarrisHawkOptimization, - "gwo": zoofs.GreyWolfOptimization, - "dfo": zoofs.DragonFlyOptimization, - "go": zoofs.GeneticOptimization, - } - if isinstance(self.strategy, str): if self.strategy not in ("univariate", "pca"): solver: FeatureSelectionSolvers @@ -1365,6 +1347,13 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): else: check_y() + strategies = { + "pso": ParticleSwarmOptimization, + "hho": HarrisHawkOptimization, + "gwo": GreyWolfOptimization, + "dfo": DragonFlyOptimization, + "go": GeneticOptimization, + } # Either use a provided validation set or cross-validation over X if "X_valid" in kwargs: diff --git a/atom/models/classreg.py b/atom/models/classreg.py index 41250d822..02dffde14 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -7,6 +7,7 @@ from __future__ import annotations +from collections.abc import Mapping from typing import Any, ClassVar, cast import numpy as np @@ -76,7 +77,7 @@ class 
AdaBoost(BaseModel): "regression": "sklearn.ensemble.AdaBoostRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -734,7 +735,7 @@ class DecisionTree(BaseModel): "regression": "sklearn.tree.DecisionTreeRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -809,7 +810,7 @@ class Dummy(BaseModel): "regression": "sklearn.dummy.DummyRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -943,7 +944,7 @@ class ExtraTree(BaseModel): "regression": "sklearn.tree.ExtraTreeRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1038,7 +1039,7 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1229,7 +1230,7 @@ class GradientBoostingMachine(BaseModel): "regression": "sklearn.ensemble.GradientBoostingRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1376,7 +1377,7 @@ class HistGradientBoosting(BaseModel): "regression": "sklearn.ensemble.HistGradientBoostingRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1452,7 +1453,7 @@ class KNearestNeighbors(BaseModel): "regression": "sklearn.neighbors.KNeighborsRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1951,7 +1952,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: else: return super()._get_est(params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2053,7 +2054,7 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2161,7 +2162,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2400,7 +2401,7 @@ class PassiveAggressive(BaseModel): "regression": "sklearn.linear_model.PassiveAggressiveRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. 
Returns @@ -2712,7 +2713,7 @@ def _get_parameters(self, trial: Trial) -> dict: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2805,7 +2806,7 @@ class Ridge(BaseModel): "regression": "sklearn.linear_model.Ridge", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -2878,7 +2879,7 @@ class StochasticGradientDescent(BaseModel): "regression": "sklearn.linear_model.SGDRegressor", } - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -3003,7 +3004,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: else: return super()._get_est(params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns diff --git a/atom/models/ts.py b/atom/models/ts.py index 59f078f1b..f067eb5d9 100644 --- a/atom/models/ts.py +++ b/atom/models/ts.py @@ -7,6 +7,7 @@ from __future__ import annotations +from collections.abc import Mapping from logging import ERROR, WARNING, getLogger from typing import Any, ClassVar @@ -161,7 +162,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: """ return super()._get_est({"suppress_warnings": self.warnings == "ignore"} | params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -835,7 +836,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: return {"stl_kwargs": self._est_params.get("stl_kwargs", {}) | params} - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1207,7 +1208,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: return params - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. Returns @@ -1652,7 +1653,7 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: """ return super()._get_est({"suppress_warnings": self.warnings == "ignore"} | params) - def _get_distributions(self) -> dict[str, BaseDistribution]: + def _get_distributions(self) -> Mapping[str, BaseDistribution]: """Get the predefined hyperparameter distributions. 
Returns diff --git a/atom/nlp.py b/atom/nlp.py index 6c855d343..221ca7029 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -10,11 +10,11 @@ import re import unicodedata from string import punctuation +from typing import TYPE_CHECKING import numpy as np import pandas as pd from beartype import beartype -from polars.dependencies import _lazy_import from sklearn.base import OneToOneFeatureMixin from sklearn.utils.validation import _check_feature_names_in from typing_extensions import Self @@ -22,14 +22,15 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, - XConstructor, YConstructor, bool_t, + XConstructor, YConstructor, bool_t, EngineEstimatorOptions ) from atom.utils.utils import ( check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df, ) -nltk, _ = _lazy_import("nltk") +if TYPE_CHECKING: + from nltk.corpus import wordnet @beartype @@ -459,7 +460,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr """ - def pos(tag: str) -> nltk.corpus.wordnet: + def pos(tag: str) -> wordnet.ADJ | wordnet.ADV | wordnet.VERB | wordnet.NOUN: """Get part of speech from a tag. Parameters @@ -474,13 +475,17 @@ def pos(tag: str) -> nltk.corpus.wordnet: """ if tag in ("JJ", "JJR", "JJS"): - return nltk.corpus.wordnet.ADJ + return wordnet.ADJ elif tag in ("RB", "RBR", "RBS"): - return nltk.corpus.wordnet.ADV + return wordnet.ADV elif tag in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): - return nltk.corpus.wordnet.VERB + return wordnet.VERB else: # "NN", "NNS", "NNP", "NNPS" - return nltk.corpus.wordnet.NOUN + return wordnet.NOUN + + from nltk import pos_tag + from nltk.corpus import stopwords, wordnet + from nltk.stem import SnowballStemmer, WordNetLemmatizer Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) corpus = get_corpus(Xt) @@ -491,22 +496,22 @@ def pos(tag: str) -> nltk.corpus.wordnet: if isinstance(Xt[corpus].iloc[0], str): Xt[corpus] = Xt[corpus].apply(lambda row: row.split()) - stopwords = set() + stop_words = set() if self.stopwords: if isinstance(self.stopwords, bool_t): self.stopwords = "english" # Get stopwords from the NLTK library check_nltk_module("corpora/stopwords", quiet=self.verbose < 2) - stopwords = set(nltk.corpus.stopwords.words(self.stopwords.lower())) + stop_words = set(stopwords.words(self.stopwords.lower())) # Join predefined with customs stopwords if self.custom_stopwords is not None: - stopwords = stopwords | set(self.custom_stopwords) + stop_words = stop_words | set(self.custom_stopwords) - if stopwords: + if stop_words: self._log(" --> Dropping stopwords.", 2) - f = lambda row: [word for word in row if word not in stopwords] + f = lambda row: [word for word in row if word not in stop_words] Xt[corpus] = Xt[corpus].apply(f) if self.stem: @@ -514,7 +519,7 @@ def pos(tag: str) -> nltk.corpus.wordnet: self.stem = "english" self._log(" --> Applying stemming.", 2) - ss = nltk.stem.SnowballStemmer(language=self.stem.lower()) + ss = SnowballStemmer(language=self.stem.lower()) Xt[corpus] = Xt[corpus].apply(lambda row: [ss.stem(word) for word in row]) if self.lemmatize: @@ -523,8 +528,8 @@ def pos(tag: str) -> nltk.corpus.wordnet: check_nltk_module("taggers/averaged_perceptron_tagger", quiet=self.verbose < 2) check_nltk_module("corpora/omw-1.4", quiet=self.verbose < 2) - wnl = nltk.stem.WordNetLemmatizer() - f = lambda row: [wnl.lemmatize(w, pos(tag)) for w, tag in nltk.pos_tag(row)] + wnl = WordNetLemmatizer() + f = lambda row: 
[wnl.lemmatize(w, pos(tag)) for w, tag in pos_tag(row)] Xt[corpus] = Xt[corpus].apply(f) return self._convert(Xt) @@ -706,6 +711,9 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: return row_c[2:-2].split(sep) + import nltk.collocations as collocations + from nltk import word_tokenize + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) corpus = get_corpus(Xt) @@ -713,12 +721,12 @@ def replace_ngrams(row: list[str], ngram: tuple[str]) -> list[str]: if isinstance(Xt[corpus].iloc[0], str): check_nltk_module("tokenizers/punkt", quiet=self.verbose < 2) - Xt[corpus] = Xt[corpus].apply(lambda row: nltk.word_tokenize(row)) + Xt[corpus] = Xt[corpus].apply(lambda row: word_tokenize(row)) ngrams = { - "bigrams": nltk.collocations.BigramCollocationFinder, - "trigrams": nltk.collocations.TrigramCollocationFinder, - "quadgrams": nltk.collocations.QuadgramCollocationFinder, + "bigrams": collocations.BigramCollocationFinder, + "trigrams": collocations.TrigramCollocationFinder, + "quadgrams": collocations.QuadgramCollocationFinder, } for attr, finder in ngrams.items(): @@ -878,7 +886,7 @@ def __init__( *, return_sparse: Bool = True, device: str = "cpu", - engine: Engine = None, + engine: EngineEstimatorOptions = None, verbose: Verbose = 0, **kwargs, ): diff --git a/atom/pipeline.py b/atom/pipeline.py index 156bcd45d..0ef061b19 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -9,7 +9,7 @@ from collections.abc import Iterator from itertools import islice -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import numpy as np import pandas as pd @@ -23,7 +23,6 @@ from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import check_memory from sktime.forecasting.base import BaseForecaster -from sktime.proba.normal import Normal from typing_extensions import Self from atom.utils.types import ( @@ -31,11 +30,15 @@ Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, ) from atom.utils.utils import ( - NotFittedError, adjust_verbosity, check_is_fitted, fit_one, + NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, transform_one, variable_return, ) +if TYPE_CHECKING: + from sktime.proba.normal import Normal + + class Pipeline(SkPipeline): """Pipeline of transforms with a final estimator. @@ -223,6 +226,26 @@ def _can_inverse_transform(self) -> bool: for _, _, est in self._iter() ) + def _convert(self, obj: Pandas | None) -> Any: + """Convert data to the type set in the data engine. + + Parameters + ---------- + obj: pd.Series, pd.DataFrame or None + Object to convert. If None, return as is. + + Returns + ------- + object + Converted data. + + """ + # Only apply transformations when the engine is defined + if hasattr(self, "_engine") and isinstance(obj, pd.Series | pd.DataFrame): + return self._engine.data_engine.convert(obj) + else: + return obj + def _iter( self, *, @@ -284,7 +307,7 @@ def _fit( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. None if the pipeline only uses y. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. 
routed_params: dict or None, default=None @@ -320,7 +343,7 @@ def _fit( if hasattr(transformer, attr): setattr(cloned, attr, getattr(transformer, attr)) - with adjust_verbosity(cloned, self._verbose): + with adjust(cloned, verbose=self._verbose): # Fit or load the current estimator from cache # Type ignore because routed_params is never None but # the signature of _fit needs to comply with sklearn's @@ -432,7 +455,7 @@ def fit( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. **params @@ -451,7 +474,7 @@ def fit( with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator is not None and self._final_estimator != "passthrough": - with adjust_verbosity(self._final_estimator, self._verbose): + with adjust(self._final_estimator, self._verbose): self._mem_fit( estimator=self._final_estimator, X=X, @@ -484,7 +507,7 @@ def fit_transform( `X` is ignored. None if the estimator only uses y. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. **params @@ -508,7 +531,7 @@ def fit_transform( if self._final_estimator is None or self._final_estimator == "passthrough": return variable_return(X, y) - with adjust_verbosity(self._final_estimator, self._verbose): + with adjust(self._final_estimator, verbose=self._verbose): X, y, _ = self._mem_fit_transform( transformer=self._final_estimator, X=X, @@ -516,7 +539,7 @@ def fit_transform( **routed_params[self.steps[-1][0]].fit_transform, ) - return variable_return(X, y) + return variable_return(self._convert(X), self._convert(y)) @available_if(_can_transform) def transform( @@ -542,7 +565,7 @@ def transform( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. None if the pipeline only uses y. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. filter_train_only: bool, default=True @@ -570,7 +593,7 @@ def transform( routed_params = process_routing(self, "transform", **params) for _, name, transformer in self._iter(filter_train_only=filter_train_only): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, y = self._mem_transform( transformer=transformer, X=X, @@ -578,7 +601,7 @@ def transform( **routed_params[name].transform, ) - return variable_return(X, y) + return variable_return(self._convert(X), self._convert(y)) @available_if(_can_inverse_transform) def inverse_transform( @@ -600,7 +623,7 @@ def inverse_transform( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. None if the pipeline only uses y. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. 
filter_train_only: bool, default=True @@ -629,7 +652,7 @@ def inverse_transform( routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter(filter_train_only=filter_train_only))) for _, name, transformer in reverse_iter: - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, y = self._mem_transform( transformer=transformer, X=X, @@ -638,7 +661,7 @@ def inverse_transform( **routed_params[name].inverse_transform, ) - return variable_return(X, y) + return variable_return(self._convert(X), self._convert(y)) @available_if(_final_estimator_has("decision_function")) def decision_function(self, X: XConstructor, **params) -> np.ndarray: @@ -668,7 +691,7 @@ def decision_function(self, X: XConstructor, **params) -> np.ndarray: routed_params = process_routing(self, "decision_function", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, _ = self._mem_transform( transformer=transformer, X=X, @@ -720,7 +743,7 @@ def predict( routed_params = process_routing(self, "predict", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): @@ -738,7 +761,7 @@ def predict_interval( X: XConstructor | None = None, *, coverage: Float | Sequence[Float] = 0.9, - ) -> Pandas: + ) -> pd.DataFrame: """Transform, then predict_quantiles of the final estimator. Parameters @@ -760,7 +783,7 @@ def predict_interval( """ for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, y = self._mem_transform(transformer, X) return self.steps[-1][1].predict_interval(fh=fh, X=X, coverage=coverage) @@ -789,7 +812,7 @@ def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray: routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( @@ -843,7 +866,7 @@ def predict_proba( routed_params = process_routing(self, "predict_proba", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): @@ -886,7 +909,7 @@ def predict_quantiles( """ for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, y = self._mem_transform(transformer, X) return self.steps[-1][1].predict_quantiles(fh=fh, X=X, alpha=alpha) @@ -915,7 +938,7 @@ def predict_residuals( """ for _, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, y = self._mem_transform(transformer, X, y) return self.steps[-1][1].predict_residuals(y=y, X=X) @@ -950,12 +973,12 @@ def predict_var( """ for 
_, _, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, _ = self._mem_transform(transformer, X) return self.steps[-1][1].predict_var(fh=fh, X=X, cov=cov) - def set_output(self, *, transform: EngineDataOptions | None = None): + def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: """Set output container. See sklearn's [user guide][set_output] on how to use the @@ -986,10 +1009,9 @@ def set_output(self, *, transform: EngineDataOptions | None = None): Estimator instance. """ - if transform is None: - return self + if transform is not None: + self._engine = EngineTuple(data=transform) - self.engine = getattr(self, "engine", EngineTuple()).data = transform return self @available_if(_final_estimator_has("score")) @@ -1010,7 +1032,7 @@ def score( Feature set with shape=(n_samples, n_features). Can only be `None` for [forecast][time-series] tasks. - y: dict, sequence, dataframe or None, default=None + y: sequence, dataframe-like or None, default=None Target values corresponding to `X`. fh: int, sequence, [ForecastingHorizon][] or None, default=None @@ -1038,7 +1060,7 @@ def score( routed_params = process_routing(self, "score", **params) for _, name, transformer in self._iter(with_final=False): - with adjust_verbosity(transformer, self._verbose): + with adjust(transformer, verbose=self._verbose): X, y = self._mem_transform(transformer, X, y, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index 793653bef..3764e517a 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -28,7 +28,7 @@ from atom.utils.types import ( Bool, FloatLargerZero, FloatZeroToOneExc, Int, IntLargerZero, Legend, MetricSelector, Model, ModelsSelector, PlotBackend, RowSelector, Scalar, - Sequence, int_t, sequence_t, + Sequence, int_t, sequence_t, Pandas ) from atom.utils.utils import ( Aesthetics, check_is_fitted, composed, crash, get_custom_scorer, lst, @@ -140,7 +140,7 @@ def marker_size(self, value: FloatLargerZero): # Methods ====================================================== >> @staticmethod - def _get_plot_index(df: pd.DataFrame) -> pd.Index: + def _get_plot_index(obj: Pandas) -> pd.Index: """Return the dataset's index in a plottable format. Plotly does not accept all index formats (e.g., pd.Period), @@ -149,19 +149,19 @@ def _get_plot_index(df: pd.DataFrame) -> pd.Index: Parameters ---------- - df: dataframe + obj: pd.Series or pd.DataFrame Data set to get the index from. Returns ------- - index + pd.Index Index in an acceptable format. 
""" - if hasattr(df.index, "to_timestamp"): - return df.index.to_timestamp() + if hasattr(obj.index, "to_timestamp"): + return obj.index.to_timestamp() else: - return df.index + return obj.index @staticmethod def _get_show(show: IntLargerZero | None, maximum: IntLargerZero = 200) -> Int: diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index 016d4bced..4ccb1782e 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -39,7 +39,7 @@ Bool, ColumnSelector, FloatZeroToOneExc, Int, IntLargerEqualZero, IntLargerFour, IntLargerZero, Kind, Legend, MetricConstructor, MetricSelector, ModelsSelector, RowSelector, Sequence, TargetSelector, - TargetsSelector, XSelector, + TargetsSelector, XSelector, XConstructor ) from atom.utils.utils import ( Task, check_canvas, check_dependency, check_empty, check_predict_proba, @@ -1116,7 +1116,7 @@ def plot_forecast( self, models: ModelsSelector = None, fh: RowSelector | ForecastingHorizon = "dataset", - X: XSelector | None = None, + X: XConstructor | None = None, target: TargetSelector = 0, *, plot_insample: Bool = False, @@ -1232,18 +1232,20 @@ def plot_forecast( for m in models_c: if X is not None: - X = m.transform(X) + Xt = m.transform(X) elif isinstance(fh, pd.Index): - X = m.branch._all.loc[fh] + Xt = m.branch._all.loc[fh] + else: + Xt = X # Draw predictions and interval - y_pred = m.predict(fh=fh, X=check_empty(X)) + y_pred = m.predict(fh=fh, X=check_empty(Xt)) if self.task.is_multioutput: y_pred = y_pred[target_c] if not plot_insample: idx = y_pred.index.intersection(m.branch.train.index) - y_pred.loc[idx] = np.NaN # type: ignore[index] + y_pred.loc[idx] = np.NaN # type: ignore[call-overload] y_true = m.branch._all.loc[y_pred.index, target_c] @@ -1271,7 +1273,7 @@ def plot_forecast( if plot_interval: try: - y_interval = m.predict_interval(fh=fh, X=X) + y_interval = m.predict_interval(fh=fh, X=Xt) except (AttributeError, NotImplementedError): continue # Fails for some models like ES diff --git a/atom/utils/types.py b/atom/utils/types.py index bfb01c9a3..3c508e710 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -24,7 +24,7 @@ if TYPE_CHECKING: - from atom.branch.dataengines import DataEngine + from atom.data.dataengines import DataEngine from atom.utils.utils import Goal @@ -120,7 +120,7 @@ def __repr__(self) -> str: @property def data_engine(self) -> DataEngine: """Return the data engine.""" - from atom.branch.dataengines import DATA_ENGINES + from atom.data import DATA_ENGINES return DATA_ENGINES[self.data]() @@ -184,7 +184,6 @@ class Model(Protocol): """Protocol for all models.""" _goal: Goal - _metric: ClassMap _ht: dict[str, Any] def predict(self, *args, **kwargs) -> Pandas: ... @@ -217,7 +216,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... | Sequence[Sequence[Any]] | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]] | np.ndarray - | sps.spmatrix + | sps.spmatrix # scipy has no stubs, thus this becomes Any | pd.DataFrame ) XSelector: TypeAlias = XConstructor | Callable[..., XConstructor] @@ -346,6 +345,10 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
] # Others +XDatasets: TypeAlias = Literal[ + "dataset", "train", "test", "holdout", "X", "X_train", "X_test", "X_holdout" +] +YDatasets: TypeAlias = Literal["y", "y_train", "y_test", "y_holdout"] Seasonality: TypeAlias = IntLargerOne | str | Sequence[IntLargerOne | str] | None SeasonalityModels: TypeAlias = Literal["additive", "multiplicative"] FeatureNamesOut: TypeAlias = ( diff --git a/atom/utils/utils.py b/atom/utils/utils.py index ab42d7900..2ba27461c 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -18,27 +18,20 @@ from enum import Enum, IntEnum from functools import cached_property, wraps from importlib import import_module +from importlib.util import find_spec from inspect import Parameter, signature from itertools import cycle -from types import GeneratorType, MappingProxyType, ModuleType +from types import GeneratorType, MappingProxyType from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload -import mlflow -import nltk import numpy as np import pandas as pd -import plotly.graph_objects as go import scipy.sparse as sps from beartype.door import is_bearable from IPython.display import display -from matplotlib.colors import to_rgba -from mlflow.models.signature import infer_signature -from optuna.study import Study -from optuna.trial import FrozenTrial from pandas._libs.missing import NAType from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype -from shap import Explainer, Explanation from sklearn.base import BaseEstimator from sklearn.base import OneToOneFeatureMixin as FMixin from sklearn.metrics import ( @@ -53,15 +46,19 @@ Bool, Estimator, FeatureNamesOut, Float, IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, Scorer, Segment, Sequence, SPTuple, Transformer, TReturn, TReturns, - Verbose, XConstructor, XSelector, YConstructor, YSelector, int_t, - segment_t, sequence_t, + Verbose, XConstructor, XSelector, YConstructor, int_t, segment_t, + sequence_t, EngineTuple ) if TYPE_CHECKING: + from optuna.study import Study + from optuna.trial import FrozenTrial + from shap import Explainer, Explanation + from atom.basemodel import BaseModel from atom.baserunner import BaseRunner - from atom.branch import Branch + from atom.data import Branch T = TypeVar("T") @@ -93,7 +90,7 @@ def infer_task(self, y: Pandas) -> Task: Parameters ---------- - y: series or dataframe + y: pd.Series or pd.DataFrame Target column(s). Returns @@ -257,12 +254,12 @@ def get_stratify_columns(self, df: pd.DataFrame, y: Pandas) -> pd.DataFrame | No df: pd.DataFrame Dataset from which to get the columns. - y: series or dataframe + y: pd.Series or pd.DataFrame Target column(s). Returns ------- - dataframe or None + pd.DataFrame or None Dataset with subselection of columns. Returns None if there's no stratification. 
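The utils.py hunks above and below follow the convention applied throughout this patch: heavy or optional imports (mlflow, nltk, plotly, matplotlib, shap, optuna) move out of the module header and into TYPE_CHECKING blocks or the functions that use them, with check_dependency raising a helpful error first where the package is optional (as in basetransformer.py and basemodel.py). A minimal sketch of that pattern, using a hypothetical helper name that is not part of the patch:

def log_metrics(metrics: dict[str, float]) -> None:
    # Hypothetical helper, shown only to illustrate the lazy-import convention.
    check_dependency("mlflow")  # raise a clear ModuleNotFoundError if the optional package is missing
    import mlflow  # imported lazily so that `import atom` stays lightweight

    for name, value in metrics.items():
        mlflow.log_metric(name, value)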
@@ -634,6 +631,8 @@ def __call__(self, study: Study, trial: FrozenTrial): # Save trials to mlflow experiment as nested runs if self.T.experiment and self.T.log_ht: + import mlflow + with mlflow.start_run(run_id=self.T.run.info.run_id): run_name = f"{self.T.name} - {trial.number}" with mlflow.start_run(run_name=run_name, nested=True): @@ -660,7 +659,7 @@ def __call__(self, study: Study, trial: FrozenTrial): mlflow.sklearn.log_model( sk_model=estimator, artifact_path=estimator.__class__.__name__, - signature=infer_signature( + signature=mlflow.models.signature.infer_signature( model_input=pd.DataFrame(self.T.branch.X), model_output=estimator.predict(self.T.branch.X.iloc[[0]]), ), @@ -733,10 +732,12 @@ class PlotCallback: max_len = 15 # Maximum trials to show at once in the plot def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics): + import plotly.graph_objects as go + self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} - traces: list[go.Scatter] = [] + traces = [] colors = cycle(aesthetics.palette) for met in metric: color = next(colors) @@ -918,16 +919,18 @@ def explainer(self) -> Explainer: Returns ------- - Explainer + shap.Explainer Get the initialized explainer object. """ - # Pass masker as np.array and feature names separately for modin frames + from shap import Explainer + kwargs = { - "masker": self.branch.X_train.to_numpy(), + "masker": self.branch.X_train, "feature_names": list(self.branch.features), "seed": self.random_state, } + try: # Fails when model does not fit standard explainers (e.g., ensembles) return Explainer(self.estimator, **kwargs) except TypeError: @@ -1281,6 +1284,8 @@ def to_rgb(c: str) -> str: Color's RGB representation. """ + from matplotlib.colors import to_rgba + if not c.startswith("rgb"): colors = to_rgba(c)[:3] return f"rgb({colors[0]}, {colors[1]}, {colors[2]})" @@ -1317,7 +1322,7 @@ def merge(*args) -> pd.DataFrame: Returns ------- - dataframe + pd.DataFrame Concatenated dataframe. """ @@ -1335,7 +1340,7 @@ def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_P Parameters ---------- - X: series or dataframe + X: pd.Series or pd.DataFrame Data set to replace. missing_values: list or None, default=None @@ -1344,7 +1349,7 @@ def replace_missing(X: T_Pandas, missing_values: list[Any] | None = None) -> T_P Returns ------- - series or dataframe + pd.Series or pd.DataFrame Data set without missing values. """ @@ -1380,7 +1385,7 @@ def get_nan(dtype: Dtype) -> float | NAType: ) -def n_cols(obj: XSelector | YSelector) -> int: +def n_cols(obj: XConstructor | YConstructor) -> int: """Get the number of columns in a dataset. Parameters @@ -1414,12 +1419,12 @@ def get_cols(obj: Pandas) -> list[pd.Series]: Parameters ---------- - obj: series or dataframe + obj: pd.Series or pd.DataFrame Element to get the columns from. Returns ------- - list of series + list of pd.Series Columns. """ @@ -1447,15 +1452,15 @@ def get_col_names(obj: Any) -> list[str] | None: if isinstance(obj, pd.DataFrame): return list(obj.columns) elif isinstance(obj, pd.Series): - return [obj.name] + return [str(obj.name)] else: return None def variable_return( X: pd.DataFrame | None, - y: pd.Series | None, -) -> pd.DataFrame | pd.Series | tuple[pd.DataFrame, Pandas]: + y: Pandas | None, +) -> Pandas | tuple[pd.DataFrame, Pandas]: """Return one or two arguments depending on which is None. 
This utility is used to make methods return only the provided @@ -1475,12 +1480,14 @@ def variable_return( Data sets that are not None. """ - if y is None: + if y is None and X is not None: return X - elif X is None: + elif X is None and y is not None: return y - else: + elif X is not None and y is not None: return X, y + else: + raise ValueError("Both X and y can't be None.") def get_segment(obj: list[T], segment: Segment) -> list[T]: @@ -1513,7 +1520,7 @@ def is_sparse(obj: Pandas) -> bool: Parameters ---------- - obj: series or dataframe + obj: pd.Series or pd.DataFrame Data set to check. Returns @@ -1525,48 +1532,40 @@ def is_sparse(obj: Pandas) -> bool: return any(isinstance(col.dtype, pd.SparseDtype) for col in get_cols(obj)) -def check_empty(obj: Pandas) -> Pandas | None: +def check_empty(obj: Pandas | None) -> Pandas | None: """Check if a pandas object is empty. Parameters ---------- - obj: series or dataframe + obj: pd.Series, pd.DataFrame or None Pandas object to check. Returns ------- - series, dataframe or None - Same object or None if empty. + pd.Series, pd.DataFrame or None + Same object or None if empty or obj is None. """ return obj if isinstance(obj, pd.DataFrame) and not obj.empty else None -def check_dependency(name: str) -> ModuleType: +def check_dependency(name: str): """Check an optional dependency. - Import the module or raise an error if the package is not - installed. + Raise an error if the package is not installed. Parameters ---------- name: str Name of the package to check. - Returns - ------- - module - Imported module. - """ - try: - return import_module(name) - except ModuleNotFoundError as ex: + if not find_spec(name): raise ModuleNotFoundError( f"Unable to import the {name} package. Install it using " f"`pip install {name}` or install all of atom's optional " "dependencies with `pip install atom-ml[full]`." - ) from ex + ) def check_nltk_module(module: str, *, quiet: bool): @@ -1583,6 +1582,8 @@ def check_nltk_module(module: str, *, quiet: bool): Whether to show logs when downloading. """ + import nltk + try: nltk.data.find(module) except LookupError: @@ -1628,7 +1629,7 @@ def check_predict_proba(models: Model | Sequence[Model], method: str): ) -def check_scaling(X: Pandas) -> bool: +def check_scaling(obj: Pandas) -> bool: """Check if the data is scaled. A data set is considered scaled when the mean of the mean of @@ -1638,7 +1639,7 @@ def check_scaling(X: Pandas) -> bool: Parameters ---------- - X: series or dataframe + obj: pd.Series or pd.DataFrame Data set to check. Returns @@ -1647,15 +1648,19 @@ def check_scaling(X: Pandas) -> bool: Whether the data set is scaled. """ - df = to_df(X) - mean = df.mean(numeric_only=True).mean() - std = df.std(numeric_only=True).mean() + if isinstance(obj, pd.DataFrame): + mean = obj.mean(numeric_only=True).mean() + std = obj.std(numeric_only=True).mean() + else: + mean = obj.mean() + std = obj.std() + return bool(-0.05 < mean < 0.05 and 0.85 < std < 1.15) @contextmanager def keep_attrs(estimator: Estimator): - """Contextmanager to save an estimator's custom attributes. + """Temporarily save an estimator's custom attributes. ATOM's pipeline uses two custom attributes for its transformers: _train_only, and _cols. Since some transformers reset their @@ -1675,30 +1680,42 @@ def keep_attrs(estimator: Estimator): @contextmanager -def adjust_verbosity(estimator: Estimator, verbose: Verbose | None): - """Contextmanager to save an estimator's custom attributes. 
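A short usage sketch for two of the helpers reworked above: `check_dependency` now only verifies that the package can be found (via importlib.util.find_spec) instead of importing and returning it, and `check_scaling` inspects the series or dataframe directly:

import numpy as np
import pandas as pd

from atom.utils.utils import check_dependency, check_scaling

check_dependency("sklearn")  # silent when installed, ModuleNotFoundError otherwise

rng = np.random.default_rng(1)
raw = pd.Series(rng.normal(loc=10, scale=5, size=1_000))
scaled = (raw - raw.mean()) / raw.std()

assert not check_scaling(raw)  # mean far from 0 -> not considered scaled
assert check_scaling(scaled)   # mean ~0 and std ~1 -> considered scaled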
+def adjust( + estimator: Estimator, + *, + transform: EngineDataOptions | None = None, + verbose: Verbose | None = None, +): + """Temporarily adjust output parameters of an estimator. - ATOM's pipeline uses two custom attributes for its transformers: - _train_only, and _cols. Since some transformers reset their - attributes during fit (like those from sktime), we wrap the fit - method in a contextmanager that saves and restores the attrs. + The estimator's data engine and verbosity are temporarily changed + to the provided values. Parameters ---------- estimator: Estimator Temporarily change the verbosity of this estimator. + transform: str or None, default=None + Data engine for the estimator. If None, it leaves it to + its original engine. + verbose: int or None, default=None - Verbosity level of the transformers in the pipeline. If - None, it leaves them to their original verbosity. + Verbosity level for the estimator. If None, it leaves it to + its original verbosity. """ try: + if transform is not None and hasattr(estimator, "set_output"): + output = getattr(estimator, "_engine", EngineTuple()) + estimator.set_output(transform=transform) if verbose is not None and hasattr(estimator, "verbose"): verbosity = estimator.verbose estimator.verbose = verbose yield estimator finally: + if transform is not None and hasattr(estimator, "set_output"): + estimator._engine = output if verbose is not None and hasattr(estimator, "verbose"): estimator.verbose = verbosity @@ -1791,14 +1808,14 @@ def to_df( @overload def to_df( - data: XSelector, - index: Axes | None = ..., - columns: Axes | None = ..., + data: XConstructor, + index: Axes | None, + columns: Axes | None, ) -> pd.DataFrame: ... def to_df( - data: XSelector | None, + data: XConstructor | None, index: Axes | None = None, columns: Axes | None = None, ) -> pd.DataFrame | None: @@ -1810,20 +1827,22 @@ def to_df( Dataset to convert to a dataframe. If None or already a pandas dataframe, return unchanged. - index: sequence, index or None, default=None + index: sequence or None, default=None Values for the index. columns: sequence or None, default=None - Name of the columns. Use None for automatic naming. + Names of the columns. Use None for automatic naming. Returns ------- - dataframe or None - Dataset as dataframe. + pd.DataFrame or None + Data as dataframe. Returns None if data is None. """ - if not isinstance(data, pd.DataFrame | None): - if hasattr(data, "to_pandas"): + if data is not None: + if isinstance(data, pd.DataFrame): + data_c = data.copy() + elif hasattr(data, "to_pandas"): data_c = data.to_pandas() elif hasattr(data, "__dataframe__"): # Transform from dataframe interchange protocol @@ -1834,13 +1853,14 @@ def to_df( columns = [f"x{i}" for i in range(n_cols(data))] if sps.issparse(data): - data_c = pd.DataFrame.sparse.from_spmatrix(data, index, columns) + data_c = pd.DataFrame.sparse.from_spmatrix( + data=data, + index=index, + columns=columns, + ) else: - data_c = pd.DataFrame(data, index, columns) - else: - data_c = data + data_c = pd.DataFrame(data, index=index, columns=columns, copy=True) - if data_c is not None: # If text dataset, change the name of the column to corpus if list(data_c.columns) == ["x0"] and data_c.dtypes[0].name in CAT_TYPES: data_c = data_c.rename(columns={data_c.columns[0]: "corpus"}) @@ -1862,7 +1882,9 @@ def to_df( f"{set(data_c.columns) - set(columns)} are missing in X." 
) from None - return data_c + return data_c + else: + return None @overload @@ -1875,14 +1897,14 @@ def to_series( @overload def to_series( - data: dict[str, Any] | Sequence[Any], + data: dict[str, Any] | Sequence[Any] | pd.DataFrame, index: Axes | None = ..., name: str | None = ..., ) -> pd.Series: ... def to_series( - data: dict[str, Any] | Sequence[Any] | None, + data: dict[str, Any] | Sequence[Any] | pd.DataFrame | None, index: Axes | None = None, name: str | None = None, ) -> pd.Series | None: @@ -1890,7 +1912,7 @@ def to_series( Parameters ---------- - data: dict, sequence or None + data: dict, sequence, pd.DataFrame or None Data to convert. If None or already a pandas series, return unchanged. @@ -1902,12 +1924,16 @@ def to_series( Returns ------- - series or None - Sequence as series of a type given by the backend. + pd.Series or None + Data as series. Returns None if data is None. """ - if not isinstance(data, pd.Series | None): - if hasattr(data, "to_pandas"): + if data is not None: + if isinstance(data, pd.Series): + data_c = data.copy() + elif isinstance(data, pd.DataFrame): + data_c = data.iloc[:, 0].copy() + elif hasattr(data, "to_pandas"): data_c = data.to_pandas() else: try: @@ -1917,15 +1943,11 @@ def to_series( # Fails for inhomogeneous data array = data - data_c = pd.Series( - data=array, - index=index, - name=name or "target", - ) - else: - data_c = data + data_c = pd.Series(array, index=index, name=name or "target", copy=True) - return data_c + return data_c + else: + return None @overload @@ -1956,7 +1978,7 @@ def to_tabular( Parameters ---------- - data: dict, sequence, dataframe or None + data: dict, sequence, pd.DataFrame or None Data to convert. If None, return unchanged. index: sequence, index or None, default=None @@ -1967,8 +1989,8 @@ def to_tabular( Returns ------- - series, dataframe or None - Data as a Pandas object. + pd.Series, pd.DataFrame or None + Data as a pandas object. """ if (n_targets := n_cols(data)) == 1: @@ -2134,7 +2156,7 @@ def name_cols( Parameters ---------- - array: np.ndarray, sps.matrix, series or dataframe + array: np.ndarray, sps.matrix, pd.Series or pd.DataFrame Transformed dataset. original_df: pd.DataFrame @@ -2253,7 +2275,7 @@ def reorder_cols( Returns ------- - dataframe + pd.DataFrame Dataset with reordered columns. """ @@ -2309,7 +2331,7 @@ def fit_one( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe-like or None, default=None + y: sequence, pd.DataFrame-like or None, default=None Target column(s) corresponding to `X`. message: str or None @@ -2325,7 +2347,7 @@ def fit_one( """ Xt = to_df(X) - yt = to_tabular(y, index=Xt.index) + yt = to_tabular(y, index=getattr(Xt, "index", None)) with _print_elapsed_time("Pipeline", message): if hasattr(estimator, "fit"): @@ -2384,7 +2406,7 @@ def transform_one( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe-like or None, default=None + y: sequence, pd.DataFrame-like or None, default=None Target column(s) corresponding to `X`. method: str, default="transform" @@ -2395,10 +2417,10 @@ def transform_one( Returns ------- - dataframe or None + pd.DataFrame or None Feature set. Returns None if not provided. - series, dataframe or None + pd.Series, pd.DataFrame or None Target column(s). Returns None if not provided. 
""" @@ -2408,7 +2430,7 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: Parameters ---------- - out: np.ndarray, sps.matrix or dataframe + out: np.ndarray, sps.matrix or pd.DataFrame Data returned by the transformation. og: pd.DataFrame @@ -2440,7 +2462,7 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: return out Xt = to_df(X) - yt = to_tabular(y, index=Xt.index) + yt = to_tabular(y, index=getattr(Xt, "index", None)) use_y = True @@ -2508,7 +2530,7 @@ def fit_transform_one( Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: dict, sequence, dataframe-like or None + y: sequence, pd.DataFrame-like or None Target column(s) corresponding to `X`. message: str or None, default=None @@ -2667,6 +2689,7 @@ def wrapper(*args, **kwargs) -> Any: return wrapper + def make_sklearn( obj: T_Estimator, feature_names_out: FeatureNamesOut = "one-to-one", @@ -2734,10 +2757,10 @@ def wrapper(self, *args, **kwargs): return wrapper - if not obj.__module__.startswith(("atom.", "sklearn.", "imblearn.")) and hasattr(obj, "fit"): - if isinstance(obj, type): + if not obj.__module__.startswith(("atom.", "sklearn.", "imblearn.")): + if isinstance(obj, type) and hasattr(obj, "fit"): obj.fit = wrap_fit(obj.fit) - else: + elif hasattr(obj.__class__, "fit"): obj.fit = wrap_fit(obj.__class__.fit).__get__(obj) # type: ignore[method-assign] return obj diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index 97dc68b4a..f464f3795 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -26,31 +26,24 @@ packages are necessary for its correct functioning. * **[beartype](https://beartype.readthedocs.io/en/latest/)** (>=0.16.4) * **[category-encoders](https://contrib.scikit-learn.org/categorical-encoding/index.html)** (>=2.6.3) -* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8) * **[dill](https://pypi.org/project/dill/)** (>=0.3.6) +* **[featuretools](https://www.featuretools.com/)** (>=1.28.0) * **[gplearn](https://gplearn.readthedocs.io/en/stable/index.html)** (>=0.4.2) * **[imbalanced-learn](https://imbalanced-learn.readthedocs.io/en/stable/api.html)** (>=0.12.0) * **[ipython](https://ipython.readthedocs.io/en/stable/)** (>=8.11.0) * **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1) -* **[featuretools](https://www.featuretools.com/)** (>=1.28.0) * **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1) * **[matplotlib](https://matplotlib.org/)** (>=3.7.2) * **[mlflow](https://mlflow.org/)** (>=2.7.1) -* **[modin[ray]](https://modin.readthedocs.io/en/stable/)** (>=0.25.0) * **[nltk](https://www.nltk.org/)** (>=3.8.1) * **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) * **[pandas](https://pandas.pydata.org/)** (>=2.1.2) * **[plotly](https://plotly.com/python/)** (>=5.18.0) -* **[polars](https://pola.rs/)** (>=0.20.7) -* **[pyarrow](https://arrow.apache.org/docs/python/)** (>=15.0.0) -* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1) -* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0) * **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.4.0) -* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1) * **[scipy](https://www.scipy.org/)** (>=1.10.1) * **[shap](https://github.com/slundberg/shap/)** (>=0.43.0) -* **[sktime[forecasting]](http://www.sktime.net/en/latest/)** (>=0.24.0) +* **[sktime[forecasting]](http://www.sktime.net/en/latest/)** (>=0.26.0) * 
**[statsmodels](https://www.statsmodels.org/stable/index.html)** (>=0.14.1)
 * **[zoofs](https://jaswinder9051998.github.io/zoofs/)** (>=0.1.26)
 
@@ -63,9 +56,18 @@ additional libraries. You can install all the optional dependencies using
 
 * **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5)
 * **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2)
+* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8)
+* **[dask](https://dask.org/)** (>=2024.2.0)
 * **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3)
 * **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4)
 * **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0)
+* **[modin[ray]](https://modin.readthedocs.io/en/stable/)** (>=0.25.0)
+* **[polars](https://pola.rs/)** (>=0.20.7)
+* **[pyarrow](https://arrow.apache.org/docs/python/)** (>=15.0.0)
+* **[pyspark](https://github.com/apache/spark/tree/master/python)** (>=3.5.0)
+* **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.9.1)
+* **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0)
+* **[scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex)** (>=2023.2.1)
 * **[schemdraw](https://schemdraw.readthedocs.io/en/latest/index.html)** (>=0.16)
 * **[statsforecast](https://github.com/Nixtla/statsforecast/)** (>=1.6.0)
 * **[sweetviz](https://github.com/fbdesignpro/sweetviz)** (>=2.3.1)
@@ -87,9 +89,7 @@ running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/l
 * **[mypy](https://www.mypy-lang.org/)** (>=1.6.1)
 * **[pandas_stubs](https://pypi.org/project/pandas-stubs/)** (>=2.1.1.230928)
 * **[pre-commit](https://pre-commit.com/)** (>=3.5.0)
-* **[pyspark-stubs](https://github.com/zero323/pyspark-stubs)** (>=3.0.0)
 * **[ruff](https://docs.astral.sh/ruff/)** (>=0.1.7)
-* **[types-requests](https://github.com/python/typeshed)** (>=2.31.0.10)
 
 **Testing**
 
diff --git a/pyproject.toml b/pyproject.toml
index 1948432e5..ae2d50a20 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,31 +21,24 @@ classifiers = [
 dependencies = [
     "beartype>=0.16.4",
     "category-encoders>=2.6.3",
-    "dagshub>=0.3.8",
     "dill>=0.3.6",
+    "featuretools>=1.28.0",
     "gplearn>=0.4.2",
     "imbalanced-learn>=0.12.0",
     "ipython>=8.11.0",
     "ipywidgets>=8.1.1",
-    "featuretools>=1.28.0",
     "joblib>=1.3.1",
     "matplotlib>=3.7.2",
     "mlflow>=2.7.1",
-    "modin[ray]>=0.25.0",
     "nltk>=3.8.1",
     "numpy>=1.23.0",
     "optuna>=3.4.0",
     "pandas>=2.1.2",
     "plotly>=5.18.0",
-    "polars>=0.20.7",
-    "pyarrow>=15.0.0",
-    "ray[serve]>=2.9.1",
-    "requests>=2.31.0",
     "scikit-learn>=1.4.0",
-    "scikit-learn-intelex>=2023.2.1; platform_machine == 'x86_64' or platform_machine == 'AMD64'",
     "scipy>=1.10.1",
     "shap>=0.43.0",
-    "sktime[forecasting]>=0.24.0",
+    "sktime[forecasting]>=0.26.0",
     "statsmodels>=0.14.1",
     "zoofs>=0.1.26",
 ]
@@ -54,9 +47,18 @@ dependencies = [
 full = [
     "botorch>=0.8.5",
     "catboost>=1.2",
+    "dagshub>=0.3.8",
+    "dask>=2024.2.0",
     "explainerdashboard>=0.4.3",
     "gradio>=3.44.4",
     "lightgbm>=4.1.0",
+    "modin[ray]>=0.25.0",
+    "polars>=0.20.7",
+    "pyarrow>=15.0.0",
+    "pyspark>=3.5.0",
+    "ray[serve]>=2.9.1",
+    "requests>=2.31.0",
+    "scikit-learn-intelex>=2023.2.1; platform_machine == 'x86_64' or platform_machine == 'AMD64'",
     "schemdraw>=0.16",
     "statsforecast>=1.6.0",
     "sweetviz>=2.3.1",
@@ -71,9 +73,7 @@ dev = [
     "mypy>=1.6.1",
     "pandas_stubs>=2.1.1.230928",
     "pre-commit>=3.5.0",
-    "pyspark-stubs>=3.0.0",
     "ruff>=0.1.7",
-    "types-requests>=2.31.0.10",
     # Testing
"nbmake>=1.4.1", # To test example notebooks "pytest>=7.2.1", @@ -190,4 +190,7 @@ convention = "numpy" [tool.mypy] ignore_missing_imports = true -disable_error_code = ["attr-defined"] +disable_error_code = [ + "attr-defined", + "abstract", # See https://github.com/python/mypy/issues/4717 +] diff --git a/tests/test_atom.py b/tests/test_atom.py index 9c4689d6a..dcc430e5c 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -22,6 +22,7 @@ from sklearn.preprocessing import ( LabelEncoder, MultiLabelBinarizer, OneHotEncoder, StandardScaler, ) +import pyarrow as pa from sktime.transformations.series.impute import Imputer from sktime.transformations.series.summarize import WindowSummarizer @@ -313,6 +314,13 @@ def test_inverse_transform(): assert_frame_equal(atom.inverse_transform(atom.X), X_bin) +def test_inverse_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + atom.scale() + assert isinstance(atom.inverse_transform(X_bin), pa.Table) + + def test_load_no_atom(): """Assert that an error is raised when the instance is not atom.""" trainer = DirectClassifier("LR", random_state=1) @@ -488,6 +496,13 @@ def test_transform_not_train_only(): assert len(atom.transform(X_bin)) == len(X_bin) +def test_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="pyarrow", random_state=1) + atom.scale() + assert isinstance(atom.transform(X_bin), pa.Table) + + # Test base transformers =========================================== >> def test_add_after_model(): diff --git a/tests/test_basemodel.py b/tests/test_basemodel.py index db2b3009f..beba798ca 100644 --- a/tests/test_basemodel.py +++ b/tests/test_basemodel.py @@ -12,6 +12,7 @@ import pandas as pd import pytest import requests +import polars as pl from optuna.distributions import CategoricalDistribution, IntDistribution from optuna.pruners import PatientPruner from optuna.samplers import NSGAIISampler @@ -871,6 +872,13 @@ def test_inverse_transform(): assert_frame_equal(atom.lr.inverse_transform(atom.lr.X), X_bin) +def test_inverse_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="polars", random_state=1) + atom.run("Tree") + assert isinstance(atom.tree.inverse_transform(X_bin), pl.DataFrame) + + def test_save_estimator(): """Assert that the save_estimator saves a pickle file.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) @@ -918,6 +926,13 @@ def test_transform(): assert all(-3 <= v <= 3 for v in X.to_numpy().ravel()) # Data is scaled +def test_transform_output(): + """Assert that the output type is determined by the data engine.""" + atom = ATOMClassifier(X_bin, y_bin, engine="polars", random_state=1) + atom.run("Tree") + assert isinstance(atom.tree.transform(X_bin), pl.DataFrame) + + # Test ClassRegModel ================================================== >> def test_classreg_get_tags(): diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 13b45e18b..01160c2b1 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -18,7 +18,7 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from atom import ATOMClassifier, ATOMForecaster, ATOMRegressor -from atom.branch import Branch +from atom.data import Branch from atom.training import DirectClassifier, DirectForecaster from atom.utils.types import SPTuple from 
atom.utils.utils import NotFittedError, merge diff --git a/tests/test_basetrainer.py b/tests/test_basetrainer.py index 5e3ffd6e4..26f1af366 100644 --- a/tests/test_basetrainer.py +++ b/tests/test_basetrainer.py @@ -375,8 +375,8 @@ def test_errors_keep(): assert trainer._models == [trainer.lda] -@patch("atom.basetransformer.ray", MagicMock()) -@patch("atom.basetrainer.ray", MagicMock()) +# @patch("atom.basetransformer.ray", MagicMock()) +# @patch("atom.basetrainer.ray", MagicMock()) def test_parallel_with_ray(): """Assert that parallel runs successfully with ray backend.""" trainer = DirectClassifier( diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index 987d23275..9357abdb9 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -64,13 +64,6 @@ def test_engine_parameter(engine): assert base.engine == EngineTuple() -@patch.dict("sys.modules", {"sklearnex": None}) -def test_engine_parameter_no_sklearnex(): - """Assert that an error is raised when sklearnex is not installed.""" - with pytest.raises(ModuleNotFoundError, match=".*import scikit-learn-intelex.*"): - BaseTransformer(device="cpu", engine={"estimator": "sklearnex"}) - - @pytest.mark.skipif(machine() not in ("x86_64", "AMD64"), reason="Only x86 support") def test_engine_parameter_sklearnex(): """Assert that sklearnex offloads to the right device.""" diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 586c2a737..fa9b948af 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pytest +import pyarrow as pa from pandas.testing import assert_frame_equal from sklearn.preprocessing import LabelEncoder, StandardScaler from sktime.proba.normal import Normal @@ -216,6 +217,18 @@ def test_predict_var(pipeline_ts): assert isinstance(pipeline_ts.predict_var(fh=range(3)), pd.DataFrame) +def test_set_output(pipeline): + """Assert that the set_output method determines the data engine.""" + pl = pipeline(model=False) + assert isinstance(pl.transform(X_bin), pd.DataFrame) + + pl.set_output(transform="numpy") + assert isinstance(pl.fit_transform(X_bin, y_bin)[0], np.ndarray) + + pl.set_output(transform="pyarrow") + assert isinstance(pl.inverse_transform(X_bin), pa.Table) + + def test_score_no_parameters(pipeline_ts): """Assert that an error is raised when X and fh are both None.""" with pytest.raises(ValueError, match=".*cannot be both None.*"): diff --git a/tests/test_training.py b/tests/test_training.py index bc3d8f6c9..ce90a0acb 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -36,7 +36,7 @@ def test_models_are_restored(): ) sh.run(reg_train, reg_test) assert "Tree" not in sh._models # The original model is deleted - assert all(m in sh.models for m in ("Tree4", "AdaB2", "LGB1")) + assert all(m in sh.models for m in ("Tree4", "AdaB2", "AdaB1")) def test_ts_int_train_sizes(): diff --git a/tests/test_utils.py b/tests/test_utils.py index 486c94eb3..12f107676 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,7 +6,7 @@ """ from datetime import timedelta -from unittest.mock import Mock, patch +from unittest.mock import patch import numpy as np import pytest @@ -18,9 +18,7 @@ from atom import show_versions from atom.pipeline import Pipeline from atom.utils.patches import VotingClassifier, VotingRegressor -from atom.utils.utils import ( - ClassMap, check_is_fitted, time_to_str, to_df, to_series, -) +from atom.utils.utils import ClassMap, check_is_fitted, time_to_str from .conftest import 
X_bin, X_reg, y_bin, y_reg @@ -158,9 +156,3 @@ def test_time_to_string(): assert time_to_str(timedelta(seconds=17).total_seconds()).startswith("17.00") assert time_to_str(timedelta(minutes=1, seconds=2).total_seconds()) == "01m:02s" assert time_to_str(timedelta(hours=3, minutes=8).total_seconds()) == "03h:08m:00s" - - -def test_to_pandas_with_cuml(): - """Assert that cuML objects use the to_pandas method.""" - to_df(Mock(spec=["to_pandas"]), columns=[0, 1]) - to_series(Mock(spec=["to_pandas"])) From 19588ce9a092c9a58c9708319609a009f1d1e4b3 Mon Sep 17 00:00:00 2001 From: Mavs Date: Tue, 20 Feb 2024 14:38:08 +0100 Subject: [PATCH 07/12] fix mypy in utils --- atom/atom.py | 18 ++--- atom/basemodel.py | 135 ++++++++++++++++++++--------------- atom/basetransformer.py | 3 +- atom/data/dataengines.py | 11 ++- atom/data_cleaning.py | 11 ++- atom/feature_engineering.py | 8 +-- atom/nlp.py | 4 +- atom/pipeline.py | 8 +-- atom/plots/baseplot.py | 4 +- atom/plots/predictionplot.py | 2 +- atom/utils/types.py | 99 +++++++++++++++++++++++-- atom/utils/utils.py | 95 ++++++++++++------------ tests/test_atom.py | 3 +- tests/test_basemodel.py | 2 +- tests/test_pipeline.py | 2 +- 15 files changed, 256 insertions(+), 149 deletions(-) diff --git a/atom/atom.py b/atom/atom.py index 3aedd7ae4..c6852300c 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -53,16 +53,16 @@ FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, - NormalizerStrats, NumericalStrats, Operators, Pandas, Predictor, + NormalizerStrats, NumericalStrats, Operators, Predictor, PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, SPDict, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, - XSelector, YSelector, sequence_t, + XReturn, XSelector, YReturn, YSelector, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataConfig, DataContainer, Goal, adjust, - check_dependency, composed, crash, fit_one, flt, get_cols, - get_custom_scorer, has_task, is_sparse, lst, make_sklearn, merge, - method_to_log, n_cols, replace_missing, sign, + ClassMap, DataConfig, DataContainer, Goal, adjust, check_dependency, + composed, crash, fit_one, flt, get_cols, get_custom_scorer, has_task, + is_sparse, lst, make_sklearn, merge, method_to_log, n_cols, + replace_missing, sign, ) @@ -163,7 +163,7 @@ def __init__( self._log(f"Data engine: {self.engine.data}", 1) if self.engine.estimator != EngineTuple().estimator: self._log(f"Estimator engine: {self.engine.estimator}", 1) - if self.backend == "ray" or self.n_jobs > 1: + if self.backend != "loky" and self.n_jobs > 1: self._log(f"Parallelization backend: {self.backend}", 1) if self.memory.location is not None: self._log(f"Cache storage: {os.path.join(self.memory.location, 'joblib')}", 1) @@ -672,7 +672,7 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -1094,7 +1094,7 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Transform new data through the pipeline. 
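The switch from `Pandas` to `XReturn`/`YReturn` in these signatures reflects that `transform` and `inverse_transform` now return whatever container the configured data engine dictates, as the pyarrow tests added to tests/test_atom.py assert; a minimal sketch, assuming the pyarrow extra is installed:

import pyarrow as pa
from sklearn.datasets import load_breast_cancer

from atom import ATOMClassifier

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

atom = ATOMClassifier(X, y, engine="pyarrow", random_state=1)
atom.scale()
assert isinstance(atom.transform(X), pa.Table)          # the engine decides the container
assert isinstance(atom.inverse_transform(X), pa.Table)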
Transformers that are only applied on the training set are diff --git a/atom/basemodel.py b/atom/basemodel.py index b2ec92108..4bab6b270 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -21,6 +21,7 @@ import dill as pickle import mlflow import numpy as np +import optuna import pandas as pd from beartype import beartype from joblib.memory import Memory @@ -54,7 +55,7 @@ from sktime.performance_metrics.forecasting import make_forecasting_scorer from sktime.proba.normal import Normal from sktime.split import ExpandingWindowSplitter, SingleWindowSplitter -import optuna + from atom.data import Branch, BranchManager from atom.data_cleaning import Scaler from atom.pipeline import Pipeline @@ -65,15 +66,15 @@ HT, Backend, Bool, Engine, FHConstructor, Float, FloatZeroToOneExc, Int, IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS, Predictor, RowSelector, Scalar, - Scorer, Sequence, Stages, TargetSelector, Verbose, Warnings, XSelector, - YSelector, float_t, int_t, + Scorer, Sequence, Stages, TargetSelector, Verbose, Warnings, XReturn, + XSelector, YReturn, YSelector, float_t, int_t, ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, - TrialsCallback, adjust, cache, check_dependency, check_empty, - composed, crash, estimator_has_attr, flt, get_col_names, get_cols, - get_custom_scorer, has_task, it, lst, merge, method_to_log, rnd, sign, - time_to_str, to_df, to_series, to_tabular, + TrialsCallback, adjust, cache, check_dependency, check_empty, composed, + crash, estimator_has_attr, flt, get_col_names, get_cols, get_custom_scorer, + has_task, it, lst, merge, method_to_log, rnd, sign, time_to_str, to_df, + to_series, to_tabular, ) @@ -2244,7 +2245,7 @@ def inverse_transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -2441,7 +2442,7 @@ def transform( y: YSelector | None = None, *, verbose: Verbose | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Transform new data through the pipeline. Transformers that are only applied on the training set are @@ -2561,7 +2562,7 @@ def _prediction( set with shape=(n_samples, n_features) to make predictions on. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, dict, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `y` is ignored. @@ -2617,10 +2618,15 @@ def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas] Transformed target column. """ - if isinstance(out := self.transform(X, y, verbose=verbose), tuple): + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) + + with adjust(self.pipeline, verbose=verbose) as pl: + out = pl.transform(Xt, yt) + + if isinstance(out, tuple): return out else: - return out, y + return out, yt def assign_prediction_columns() -> list[str]: """Assign column names for the prediction methods. @@ -2696,7 +2702,7 @@ def decision_function( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> Pandas: + ) -> YReturn: """Get confidence scores on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2725,7 +2731,7 @@ def decision_function( multiclass classification tasks. 
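The prediction helpers above now push new data through the model's pipeline with the reworked `adjust` contextmanager (verbosity temporarily overridden) before predicting; a stripped-down sketch of that flow, where `transform_quietly` is an illustrative name rather than atom API:

from atom.utils.utils import adjust


def transform_quietly(pipeline, X, y=None):
    """Push data through a fitted atom Pipeline without console output."""
    with adjust(pipeline, verbose=0) as pl:
        return pl.transform(X, y)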
""" - return self._prediction(X, verbose=verbose, method="decision_function") + return self._convert(self._prediction(X, verbose=verbose, method="decision_function")) @available_if(estimator_has_attr("predict")) @composed(crash, method_to_log, beartype) @@ -2735,7 +2741,7 @@ def predict( *, inverse: Bool = True, verbose: Int | None = None, - ) -> Pandas: + ) -> YReturn: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2773,7 +2779,7 @@ def predict( if inverse: return self.inverse_transform(y=pred) else: - return pred + return self._convert(pred) @available_if(estimator_has_attr("predict_log_proba")) @composed(crash, method_to_log, beartype) @@ -2782,7 +2788,7 @@ def predict_log_proba( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> pd.DataFrame: + ) -> XReturn: """Get class log-probabilities on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2804,13 +2810,13 @@ def predict_log_proba( Returns ------- - pd.DataFrame + dataframe Predicted class log-probabilities with shape=(n_samples, n_classes) or shape=(n_samples * n_classes, n_targets) with a multiindex format for [multioutput tasks][]. """ - return self._prediction(X, verbose=verbose, method="predict_log_proba") + return self._convert(self._prediction(X, verbose=verbose, method="predict_log_proba")) @available_if(estimator_has_attr("predict_proba")) @composed(crash, method_to_log, beartype) @@ -2819,7 +2825,7 @@ def predict_proba( X: RowSelector | XSelector, *, verbose: Int | None = None, - ) -> pd.DataFrame: + ) -> XReturn: """Get class probabilities on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2841,13 +2847,13 @@ def predict_proba( Returns ------- - pd.DataFrame + dataframe Predicted class probabilities with shape=(n_samples, n_classes) or shape=(n_samples * n_classes, n_targets) with a multiindex format for [multioutput tasks][]. """ - return self._prediction(X, verbose=verbose, method="predict_proba") + return self._convert(self._prediction(X, verbose=verbose, method="predict_proba")) @available_if(estimator_has_attr("score")) @composed(crash, method_to_log, beartype) @@ -2880,7 +2886,7 @@ def score( set with shape=(n_samples, n_features) to make predictions on. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, dict, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `X` must be a selection of rows in the dataset. @@ -2981,7 +2987,7 @@ def _prediction( verbose: Int | None = None, method: PredictionMethodsTS = "predict", **kwargs, - ) -> Float | Pandas: + ) -> Float | Normal | Pandas: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -2994,7 +3000,7 @@ def _prediction( The [forecasting horizon][row-and-column-selection] encoding the time stamps to forecast at. - y: int, str, dict, sequence, dataframe or None, default=None + y: int, str, dict, sequence, dataframe-like or None, default=None Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None @@ -3018,18 +3024,23 @@ def _prediction( Returns ------- - float, series or dataframe + float, sktime.proba.[Normal][], series or dataframe Calculated predictions. The return type depends on the method called. 
""" if y is not None or X is not None: - if isinstance(out := self.transform(X, y, verbose=verbose), tuple): + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) + + with adjust(self.pipeline, verbose=verbose) as pl: + out = pl.transform(Xt, yt) + + if isinstance(out, tuple): Xt, yt = out elif X is not None: - Xt, yt = out, y + Xt, yt = out, yt else: - Xt, yt = X, out + Xt, yt = Xt, out else: Xt, yt = X, y @@ -3057,7 +3068,7 @@ def predict( *, inverse: Bool = True, verbose: Int | None = None, - ) -> Pandas: + ) -> YReturn: """Get predictions on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3097,7 +3108,7 @@ def predict( if inverse: return self.inverse_transform(y=pred) else: - return pred + return self._convert(pred) @available_if(estimator_has_attr("predict_interval")) @composed(crash, method_to_log, beartype) @@ -3108,7 +3119,7 @@ def predict_interval( *, coverage: Float | Sequence[Float] = 0.9, verbose: Int | None = None, - ) -> pd.DataFrame: + ) -> XReturn: """Get prediction intervals on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3135,16 +3146,18 @@ def predict_interval( Returns ------- - pd.DataFrame + dataframe Computed interval forecasts. """ - return self._prediction( - fh=fh, - X=X, - coverage=coverage, - verbose=verbose, - method="predict_interval", + return self._convert( + self._prediction( + fh=fh, + X=X, + coverage=coverage, + verbose=verbose, + method="predict_interval", + ) ) @available_if(estimator_has_attr("predict_proba")) @@ -3204,7 +3217,7 @@ def predict_quantiles( *, alpha: Float | Sequence[Float] = (0.05, 0.95), verbose: Int | None = None, - ) -> pd.DataFrame: + ) -> XReturn: """Get quantile forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3232,16 +3245,18 @@ def predict_quantiles( Returns ------- - pd.DataFrame + dataframe Computed quantile forecasts. """ - return self._prediction( - fh=fh, - X=X, - alpha=alpha, - verbose=verbose, - method="predict_quantiles", + return self._convert( + self._prediction( + fh=fh, + X=X, + alpha=alpha, + verbose=verbose, + method="predict_quantiles", + ) ) @available_if(estimator_has_attr("predict_residuals")) @@ -3252,7 +3267,7 @@ def predict_residuals( X: XSelector | None = None, *, verbose: Int | None = None, - ) -> Pandas: + ) -> YReturn: """Get residuals of forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3280,7 +3295,9 @@ def predict_residuals( n_targets) for [multivariate][] tasks. """ - return self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals") + return self._convert( + self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals") + ) @available_if(estimator_has_attr("predict_var")) @composed(crash, method_to_log, beartype) @@ -3291,7 +3308,7 @@ def predict_var( *, cov: Bool = False, verbose: Int | None = None, - ) -> pd.DataFrame: + ) -> XReturn: """Get variance forecasts on new data or existing rows. New data is first transformed through the model's pipeline. @@ -3319,16 +3336,18 @@ def predict_var( Returns ------- - pd.DataFrame + dataframe Computed variance forecasts. 
""" - return self._prediction( - fh=fh, - X=X, - cov=cov, - verbose=verbose, - method="predict_var", + return self._convert( + self._prediction( + fh=fh, + X=X, + cov=cov, + verbose=verbose, + method="predict_var", + ) ) @available_if(estimator_has_attr("score")) @@ -3357,7 +3376,7 @@ def score( Parameters ---------- - y: int, str, dict, sequence or dataframe + y: int, str, dict, sequence or dataframe-like Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None diff --git a/atom/basetransformer.py b/atom/basetransformer.py index 2586332c7..ea648bdf9 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -32,7 +32,8 @@ from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas, - Sequence, Severity, Verbose, Warnings, XSelector, YSelector, bool_t, int_t, + Sequence, Severity, Verbose, Warnings, XSelector, YSelector, + bool_t, int_t, ) from atom.utils.utils import ( check_dependency, crash, lst, make_sklearn, to_df, to_tabular, diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py index 113163f2c..5bb0e080f 100644 --- a/atom/data/dataengines.py +++ b/atom/data/dataengines.py @@ -7,7 +7,6 @@ from __future__ import annotations -import os from abc import ABCMeta, abstractmethod from typing import TYPE_CHECKING @@ -25,10 +24,6 @@ import pyspark.pandas as ps -# Avoid warning about pyarrow timezones not set -os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" - - class DataEngine(metaclass=ABCMeta): """Abstract class for data engines. @@ -41,7 +36,6 @@ class DataEngine(metaclass=ABCMeta): @abstractmethod def convert(obj: Pandas) -> Any: """Convert to data engine output types.""" - pass class NumpyEngine(DataEngine): @@ -71,6 +65,11 @@ class PandasPyarrowEngine(DataEngine): library = "pandas" + def _to_numpy_dtype(self, dtype: np.dtype) -> pa.DataType: + """Convert numpy dtype to pyarrow dtype.""" + if isinstance(dtype, np.dtype): + return pa.from_numpy_dtype(dtype) # TODO: Handle numpy nullable types + @staticmethod def convert(obj: Pandas) -> Pandas: """Convert to pyarrow dtypes.""" diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index fde80861e..282a8dd91 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -46,12 +46,11 @@ from atom.basetransformer import BaseTransformer from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING from atom.utils.types import ( - Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, - EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int, - IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, - NumericalStrats, Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, - SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, - YConstructor, sequence_t, EngineEstimatorOptions + Bins, Bool, CategoricalStrats, DiscretizerStrats, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, + FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, + NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor, PrunerStrats, + Scalar, ScalerStrats, SeasonalityModels, Sequence, Transformer, Verbose, + XConstructor, YConstructor, sequence_t, ) from atom.utils.utils import ( Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst, diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 8a5b34b39..d06c39a5b 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ 
-33,10 +33,10 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( - Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, - FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, - IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, - Verbose, XConstructor, YConstructor, EngineEstimatorOptions + Bool, EngineEstimatorOptions, FeatureSelectionSolvers, + FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, + FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, + Scalar, Sequence, Verbose, XConstructor, YConstructor, ) from atom.utils.utils import ( Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse, diff --git a/atom/nlp.py b/atom/nlp.py index 221ca7029..85154552b 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -21,8 +21,8 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, - XConstructor, YConstructor, bool_t, EngineEstimatorOptions + Bool, EngineEstimatorOptions, FloatLargerZero, Sequence, + VectorizerStarts, Verbose, XConstructor, YConstructor, bool_t, ) from atom.utils.utils import ( check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df, diff --git a/atom/pipeline.py b/atom/pipeline.py index 0ef061b19..5af808dda 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -27,11 +27,11 @@ from atom.utils.types import ( Bool, EngineDataOptions, EngineTuple, Estimator, FHConstructor, Float, - Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, + Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, YReturn, ) from atom.utils.utils import ( - NotFittedError, adjust, check_is_fitted, fit_one, - fit_transform_one, transform_one, variable_return, + NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, + transform_one, variable_return, ) @@ -226,7 +226,7 @@ def _can_inverse_transform(self) -> bool: for _, _, est in self._iter() ) - def _convert(self, obj: Pandas | None) -> Any: + def _convert(self, obj: Pandas | None) -> YReturn | None: """Convert data to the type set in the data engine. 
Parameters diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py index 3764e517a..d23b16e82 100644 --- a/atom/plots/baseplot.py +++ b/atom/plots/baseplot.py @@ -27,8 +27,8 @@ from atom.utils.constants import PALETTE from atom.utils.types import ( Bool, FloatLargerZero, FloatZeroToOneExc, Int, IntLargerZero, Legend, - MetricSelector, Model, ModelsSelector, PlotBackend, RowSelector, Scalar, - Sequence, int_t, sequence_t, Pandas + MetricSelector, Model, ModelsSelector, Pandas, PlotBackend, RowSelector, + Scalar, Sequence, int_t, sequence_t, ) from atom.utils.utils import ( Aesthetics, check_is_fitted, composed, crash, get_custom_scorer, lst, diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index 4ccb1782e..cd0386a22 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -39,7 +39,7 @@ Bool, ColumnSelector, FloatZeroToOneExc, Int, IntLargerEqualZero, IntLargerFour, IntLargerZero, Kind, Legend, MetricConstructor, MetricSelector, ModelsSelector, RowSelector, Sequence, TargetSelector, - TargetsSelector, XSelector, XConstructor + TargetsSelector, XConstructor, ) from atom.utils.utils import ( Task, check_canvas, check_dependency, check_empty, check_predict_proba, diff --git a/atom/utils/types.py b/atom/utils/types.py index 3c508e710..32c1ae330 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -7,7 +7,9 @@ from __future__ import annotations +import os from collections.abc import Callable, Hashable, Iterable, Iterator +from importlib.util import find_spec from typing import ( TYPE_CHECKING, Annotated, Any, Literal, NamedTuple, SupportsIndex, TypeAlias, TypedDict, TypeVar, overload, runtime_checkable, @@ -15,7 +17,6 @@ import numpy as np import pandas as pd -import scipy.sparse as sps from beartype.door import is_bearable from beartype.typing import Protocol from beartype.vale import Is @@ -28,6 +29,10 @@ from atom.utils.utils import Goal +# Avoid warning about pyarrow timezones not set +os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" + + # Classes for type hinting ========================================= >> _T = TypeVar("_T") @@ -133,6 +138,23 @@ class SPTuple(NamedTuple): trend_model: SeasonalityModels = "additive" +@runtime_checkable +class SparseMatrix(Protocol): + """Protocol for sparse matrices. + + Required since scipy doesn't have stubs. + + """ + + def _bsr_container(self): ... + def _coo_container(self): ... + def _csc_container(self): ... + def _csr_container(self): ... + def _dia_container(self): ... + def _dok_container(self): ... + def _lil_container(self): ... + + @runtime_checkable class SkScorer(Protocol): """Protocol for sklearn's scorers.""" @@ -216,7 +238,8 @@ def predict(self, *args, **kwargs) -> Pandas: ... | Sequence[Sequence[Any]] | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]] | np.ndarray - | sps.spmatrix # scipy has no stubs, thus this becomes Any + | SparseMatrix + | pd.Series | pd.DataFrame ) XSelector: TypeAlias = XConstructor | Callable[..., XConstructor] @@ -224,10 +247,6 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
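These engine-dependent return types are what the pipeline's `set_output` method switches between, mirroring the new test added to tests/test_pipeline.py; a small sketch, assuming pyarrow is installed:

import numpy as np
import pandas as pd
import pyarrow as pa
from sklearn.preprocessing import StandardScaler

from atom.pipeline import Pipeline

X = pd.DataFrame({"x0": [0.0, 1.0, 2.0], "x1": [3.0, 4.0, 5.0]})

pipe = Pipeline([("scaler", StandardScaler())]).fit(X)
assert isinstance(pipe.transform(X), pd.DataFrame)  # pandas is the default engine

pipe.set_output(transform="numpy")
assert isinstance(pipe.transform(X), np.ndarray)

pipe.set_output(transform="pyarrow")
assert isinstance(pipe.transform(X), pa.Table)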
YSelector: TypeAlias = Int | str | YConstructor FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon -# Return types for transform methods -TReturn: TypeAlias = np.ndarray | sps.spmatrix | Sequence[Any] | pd.DataFrame -TReturns: TypeAlias = TReturn | tuple[TReturn, TReturn] - # Selection of rows or columns by name or position ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | pd.DataFrame RowSelector: TypeAlias = Hashable | Sequence[Hashable] | ColumnSelector @@ -378,6 +397,74 @@ def predict(self, *args, **kwargs) -> Pandas: ... | Sequence[IntLargerEqualZero] ) +# Return types for transform methods +if TYPE_CHECKING: + import polars as pl + import pyarrow as pa + import modin.pandas as md + import dask.dataframe as dd + import pyspark.pandas as ps + from pyspark.sql import DataFrame as SparkDataFrame + + XReturn: TypeAlias = ( + Sequence[Sequence[Any]] + | np.ndarray + | SparseMatrix + | pd.DataFrame + | pl.DataFrame + | pl.LazyFrame + | pa.Table + | md.DataFrame + | dd.DataFrame + | SparkDataFrame + ) + YReturn: TypeAlias = ( + Sequence[Any] + | np.ndarray + | pd.Series + | pl.Series + | pa.Array + | md.Series + | dd.Series + | ps.Series + ) +else: + XReturn: TypeAlias = Sequence[Sequence[Any]] | np.ndarray | SparseMatrix | pd.DataFrame + YReturn: TypeAlias = Sequence[Any] | np.ndarray | pd.Series + + if find_spec("polars"): + import polars as pl + + XReturn = XReturn | pl.DataFrame | pl.LazyFrame + YReturn = YReturn | pl.Series + + if find_spec("pyarrow"): + import pyarrow as pa + + XReturn = XReturn | pa.Table + YReturn = YReturn | pa.Array + + if find_spec("modin"): + import modin.pandas as md + + XReturn = XReturn | md.DataFrame + YReturn = YReturn | md.Series + + if find_spec("dask"): + import dask.dataframe as dd + + XReturn = XReturn | dd.DataFrame + YReturn = YReturn | dd.Series + + if find_spec("pyspark"): + import pyspark.pandas as ps + from pyspark.sql import DataFrame as SparkDataFrame + + XReturn = XReturn | SparkDataFrame | ps.DataFrame + YReturn = YReturn | SparkDataFrame | ps.Series + + YReturn = YReturn | XReturn + # Variable types for isinstance ================================== >> diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 2ba27461c..5be2e8892 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -22,7 +22,7 @@ from inspect import Parameter, signature from itertools import cycle from types import GeneratorType, MappingProxyType -from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload +from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload, Hashable import numpy as np import pandas as pd @@ -43,11 +43,11 @@ from atom.utils.constants import CAT_TYPES, __version__ from atom.utils.types import ( - Bool, Estimator, FeatureNamesOut, Float, IndexSelector, Int, + Bool, EngineTuple, Estimator, FeatureNamesOut, Float, IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Sequence, SPTuple, Transformer, TReturn, TReturns, - Verbose, XConstructor, XSelector, YConstructor, int_t, segment_t, - sequence_t, EngineTuple + Scorer, Segment, Sequence, SPTuple, Transformer, Verbose, + XConstructor, XReturn, YConstructor, int_t, segment_t, + sequence_t, EngineDataOptions ) @@ -1385,21 +1385,21 @@ def get_nan(dtype: Dtype) -> float | NAType: ) -def n_cols(obj: XConstructor | YConstructor) -> int: +def n_cols(obj: YConstructor | None) -> int: """Get the number of columns in a dataset. 
Parameters ---------- - obj: dict, sequence or dataframe-like + obj: dict, sequence, dataframe-like or None Dataset to check. Returns ------- - int or None + int Number of columns. """ - if hasattr(obj, "shape"): + if obj is not None and hasattr(obj, "shape"): return obj.shape[1] if len(obj.shape) > 1 else 1 elif isinstance(obj, dict): return 2 # Dict always goes to dataframe @@ -1809,8 +1809,8 @@ def to_df( @overload def to_df( data: XConstructor, - index: Axes | None, - columns: Axes | None, + index: Axes | None = ..., + columns: Axes | None = ..., ) -> pd.DataFrame: ... @@ -1853,13 +1853,14 @@ def to_df( columns = [f"x{i}" for i in range(n_cols(data))] if sps.issparse(data): - data_c = pd.DataFrame.sparse.from_spmatrix( - data=data, + data_c = pd.DataFrame.sparse.from_spmatrix(data, index, columns) + else: + data_c = pd.DataFrame( + data=data, # type: ignore[misc, arg-type] index=index, columns=columns, + copy=True, ) - else: - data_c = pd.DataFrame(data, index=index, columns=columns, copy=True) # If text dataset, change the name of the column to corpus if list(data_c.columns) == ["x0"] and data_c.dtypes[0].name in CAT_TYPES: @@ -1879,7 +1880,8 @@ def to_df( except KeyError: raise ValueError( f"The columns are different than seen at fit time. Features " - f"{set(data_c.columns) - set(columns)} are missing in X." + f"{set(data_c.columns) - set(columns)} " # type: ignore[arg-type] + "are missing in X." ) from None return data_c @@ -1994,12 +1996,12 @@ def to_tabular( """ if (n_targets := n_cols(data)) == 1: - return to_series(data, index=index, name=flt(columns)) + return to_series(data, index=index, name=flt(columns)) # type: ignore[misc, arg-type] else: if columns is None and not hasattr(data, "__dataframe__"): columns = [f"y{i}" for i in range(n_targets)] - return to_df(data, index=index, columns=columns) + return to_df(data, index=index, columns=columns) # type: ignore[misc, arg-type] def check_is_fitted( @@ -2144,10 +2146,10 @@ def get_custom_scorer(metric: str | MetricFunction | Scorer) -> Scorer: # Pipeline functions =============================================== >> def name_cols( - array: TReturn, + df: pd.DataFrame, original_df: pd.DataFrame, col_names: list[str], -) -> list[str]: +) -> pd.Index: """Get the column names after a transformation. If the number of columns is unchanged, the original @@ -2156,7 +2158,7 @@ def name_cols( Parameters ---------- - array: np.ndarray, sps.matrix, pd.Series or pd.DataFrame + df: pd.DataFrame Transformed dataset. original_df: pd.DataFrame @@ -2167,23 +2169,23 @@ def name_cols( Returns ------- - list of str + pd.Index Column names. 
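The conversion helpers being adjusted here (`to_df`, `to_series`, `to_tabular`) normalise arbitrary constructor input to pandas, copy it, and return None for None input; a behaviour sketch under those assumptions:

import numpy as np
import pandas as pd

from atom.utils.utils import to_df, to_series, to_tabular

X = to_df(np.ones((3, 2)))  # unnamed input gets automatic column names
assert list(X.columns) == ["x0", "x1"]

y = to_series([0, 1, 0])
assert isinstance(y, pd.Series) and y.name == "target"  # default name

assert isinstance(to_tabular([0, 1, 0]), pd.Series)            # one target -> series
assert isinstance(to_tabular(np.zeros((3, 2))), pd.DataFrame)  # several targets -> dataframe
assert to_tabular(None) is None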
""" # If columns were only transformed, return og names - if array.shape[1] == len(col_names): - return col_names + if df.shape[1] == len(col_names): + return pd.Index(col_names) # If columns were added or removed temp_cols = [] - for i, col in enumerate(array.T): + for i, (name, column) in enumerate(df.items()): # equal_nan=True fails for non-numeric dtypes - mask = original_df.apply( + mask = original_df.apply( # type: ignore[type-var] lambda c: np.array_equal( a1=c, - a2=col, - equal_nan=is_numeric_dtype(c) and np.issubdtype(col.dtype, np.number), + a2=str(name), + equal_nan=is_numeric_dtype(c) and np.issubdtype(column.dtype, np.number), ), ) @@ -2201,7 +2203,7 @@ def name_cols( else: counter += 1 - return temp_cols + return pd.Index(temp_cols) def get_col_order( @@ -2351,7 +2353,7 @@ def fit_one( with _print_elapsed_time("Pipeline", message): if hasattr(estimator, "fit"): - kwargs = {} + kwargs: dict[str, Pandas] = {} inc = getattr(estimator, "_cols", getattr(Xt, "columns", [])) if "X" in (params := sign(estimator.fit)): if Xt is not None and (cols := [c for c in inc if c in Xt]): @@ -2425,12 +2427,12 @@ def transform_one( """ - def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: + def prepare_df(out: XConstructor, og: pd.DataFrame) -> pd.DataFrame: """Convert to df and set the correct column names. Parameters ---------- - out: np.ndarray, sps.matrix or pd.DataFrame + out: dataframe-like Data returned by the transformation. og: pd.DataFrame @@ -2442,24 +2444,21 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: Transformed dataset. """ - use_cols = [c for c in inc if c in og.columns] + out_c = to_df(out, index=og.index) - # Convert to pandas and assign proper column names + # Assign proper column names + use_cols = [c for c in inc if c in og.columns] if not isinstance(out, pd.DataFrame): if hasattr(transformer, "get_feature_names_out"): - columns = transformer.get_feature_names_out() + out_c.columns = transformer.get_feature_names_out() else: - columns = name_cols(out, og, use_cols) - else: - columns = out.columns - - out = to_df(out, index=og.index, columns=columns) + out_c.columns = name_cols(out_c, og, use_cols) # Reorder columns if only a subset was used if len(use_cols) != og.shape[1]: - return reorder_cols(transformer, out, og, use_cols) + return reorder_cols(transformer, out_c, og, use_cols) else: - return out + return out_c Xt = to_df(X) yt = to_tabular(y, index=getattr(Xt, "index", None)) @@ -2489,22 +2488,24 @@ def prepare_df(out: TReturn, og: pd.DataFrame) -> pd.DataFrame: elif "X" not in params: return Xt, yt # If y is None and no X in transformer, skip the transformer - out: TReturns = getattr(transformer, method)(**kwargs, **transform_params) + out: YConstructor | tuple[XConstructor, YConstructor] = getattr(transformer, method)(**kwargs, **transform_params) # Transform can return X, y or both - if isinstance(out, tuple): + X_new: pd.DataFrame | None + y_new: Pandas | None + if isinstance(out, tuple) and Xt is not None: X_new = prepare_df(out[0], Xt) y_new = to_tabular(out[1], index=X_new.index) - if isinstance(yt, pd.DataFrame): + if isinstance(yt, pd.DataFrame) and isinstance(y_new, pd.DataFrame): y_new = prepare_df(y_new, yt) elif "X" in params and Xt is not None and any(c in Xt for c in inc): # X in -> X out - X_new = prepare_df(out, Xt) + X_new = prepare_df(out, Xt) # type: ignore[arg-type] y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0) elif y is not None: y_new = to_tabular(out) X_new = Xt if Xt is None else 
Xt.set_index(y_new.index) - if isinstance(yt, pd.DataFrame): + if isinstance(yt, pd.DataFrame) and isinstance(y_new, pd.DataFrame): y_new = prepare_df(y_new, yt) return X_new, y_new diff --git a/tests/test_atom.py b/tests/test_atom.py index dcc430e5c..4fc68e66e 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest from category_encoders.target_encoder import TargetEncoder from pandas.testing import assert_frame_equal, assert_index_equal @@ -22,7 +23,6 @@ from sklearn.preprocessing import ( LabelEncoder, MultiLabelBinarizer, OneHotEncoder, StandardScaler, ) -import pyarrow as pa from sktime.transformations.series.impute import Imputer from sktime.transformations.series.summarize import WindowSummarizer @@ -454,6 +454,7 @@ def test_shrink_pyarrow(): atom = ATOMClassifier(X_pa, y_bin, engine="pandas-pyarrow", random_state=1) assert atom.dtypes[0].name == "double[pyarrow]" atom.shrink() + print(atom.branch.dataset.dtypes[0]) assert atom.dtypes[0].name == "float[pyarrow]" diff --git a/tests/test_basemodel.py b/tests/test_basemodel.py index beba798ca..fb1e81183 100644 --- a/tests/test_basemodel.py +++ b/tests/test_basemodel.py @@ -10,9 +10,9 @@ import numpy as np import pandas as pd +import polars as pl import pytest import requests -import polars as pl from optuna.distributions import CategoricalDistribution, IntDistribution from optuna.pruners import PatientPruner from optuna.samplers import NSGAIISampler diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index fa9b948af..d87681fc0 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd -import pytest import pyarrow as pa +import pytest from pandas.testing import assert_frame_equal from sklearn.preprocessing import LabelEncoder, StandardScaler from sktime.proba.normal import Normal From 043aa7d398518f752395199a4565f8998a985b9d Mon Sep 17 00:00:00 2001 From: Marco van den Boom Date: Tue, 20 Feb 2024 16:20:43 +0100 Subject: [PATCH 08/12] mypy fix basemodel --- atom/atom.py | 4 +-- atom/basemodel.py | 70 +++++++++++++++++++++++----------------- atom/basetransformer.py | 21 ++++++------ atom/data/dataengines.py | 7 +--- 4 files changed, 55 insertions(+), 47 deletions(-) diff --git a/atom/atom.py b/atom/atom.py index c6852300c..cad84549f 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -1332,8 +1332,8 @@ def _add_transformer( ) ) if self.branch._holdout is not None: - self.branch._holdout.index = range( - len(data), len(data) + len(self.branch._holdout) + self.branch._holdout.index = pd.Index( + range(len(data), len(data) + len(self.branch._holdout)) ) elif self.branch.dataset.index.duplicated().any(): raise ValueError( diff --git a/atom/basemodel.py b/atom/basemodel.py index 4bab6b270..7a7745fb1 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -2525,7 +2525,7 @@ def _prediction( y: YSelector | None = ..., metric: str | MetricFunction | Scorer | None = ..., sample_weight: Sequence[Scalar] | None = ..., - verbose: Int | None = ..., + verbose: Verbose | None = ..., method: Literal["score"] = ..., ) -> Float: ... 
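The engine-aware output conversion also applies to trained models, as the polars tests added to tests/test_basemodel.py earlier in this patch assert; a compact sketch, assuming the polars extra is installed:

import polars as pl
from sklearn.datasets import load_breast_cancer

from atom import ATOMClassifier

X, y = load_breast_cancer(return_X_y=True, as_frame=True)

atom = ATOMClassifier(X, y, engine="polars", random_state=1)
atom.run("Tree")
assert isinstance(atom.tree.transform(X), pl.DataFrame)
# predict, predict_proba, etc. convert their output through the same data engine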
@@ -2536,8 +2536,13 @@ def _prediction( y: YSelector | None = ..., metric: str | MetricFunction | Scorer | None = ..., sample_weight: Sequence[Scalar] | None = ..., - verbose: Int | None = ..., - method: PredictionMethods = ..., + verbose: Verbose | None = ..., + method: Literal[ + "decision_function", + "predict", + "predict_log_proba", + "predict_proba", + ] = ..., ) -> Pandas: ... def _prediction( @@ -2546,7 +2551,7 @@ def _prediction( y: YSelector | None = None, metric: str | MetricFunction | Scorer | None = None, sample_weight: Sequence[Scalar] | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, method: PredictionMethods = "predict", ) -> Float | Pandas: """Get predictions on new data or existing rows. @@ -2701,7 +2706,7 @@ def decision_function( self, X: RowSelector | XSelector, *, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> YReturn: """Get confidence scores on new data or existing rows. @@ -2740,7 +2745,7 @@ def predict( X: RowSelector | XSelector, *, inverse: Bool = True, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> YReturn: """Get predictions on new data or existing rows. @@ -2787,7 +2792,7 @@ def predict_log_proba( self, X: RowSelector | XSelector, *, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> XReturn: """Get class log-probabilities on new data or existing rows. @@ -2824,7 +2829,7 @@ def predict_proba( self, X: RowSelector | XSelector, *, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> XReturn: """Get class probabilities on new data or existing rows. @@ -2864,7 +2869,7 @@ def score( *, metric: str | MetricFunction | Scorer | None = None, sample_weight: Sequence[Scalar] | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> Float: """Get a metric score on new data. @@ -2957,11 +2962,11 @@ def get_tags(self) -> dict[str, Any]: @overload def _prediction( self, - fh: RowSelector | FHConstructor | None = None, - y: RowSelector | YSelector | None = None, - X: XSelector | None = None, - metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, + fh: RowSelector | FHConstructor | None = ..., + y: RowSelector | YSelector | None = ..., + X: XSelector | None = ..., + metric: str | MetricFunction | Scorer | None = ..., + verbose: Verbose | None = ..., method: Literal["score"] = ..., **kwargs, ) -> Float: ... @@ -2969,14 +2974,21 @@ def _prediction( @overload def _prediction( self, - fh: RowSelector | FHConstructor | None = None, - y: RowSelector | YSelector | None = None, - X: XSelector | None = None, - metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, - method: PredictionMethodsTS = ..., + fh: RowSelector | FHConstructor | None = ..., + y: RowSelector | YSelector | None = ..., + X: XSelector | None = ..., + metric: str | MetricFunction | Scorer | None = ..., + verbose: Verbose | None = ..., + method: Literal[ + "predict", + "predict_interval", + "predict_proba", + "predict_quantiles", + "predict_residuals", + "predict_var", + ] = ..., **kwargs, - ) -> Pandas: ... + ) -> Normal | Pandas: ... 
def _prediction( self, @@ -2984,7 +2996,7 @@ def _prediction( y: RowSelector | YSelector | None = None, X: XSelector | None = None, metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, method: PredictionMethodsTS = "predict", **kwargs, ) -> Float | Normal | Pandas: @@ -3067,7 +3079,7 @@ def predict( X: XSelector | None = None, *, inverse: Bool = True, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> YReturn: """Get predictions on new data or existing rows. @@ -3118,7 +3130,7 @@ def predict_interval( X: XSelector | None = None, *, coverage: Float | Sequence[Float] = 0.9, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> XReturn: """Get prediction intervals on new data or existing rows. @@ -3168,7 +3180,7 @@ def predict_proba( X: XSelector | None = None, *, marginal: Bool = True, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> Normal: """Get probabilistic forecasts on new data or existing rows. @@ -3216,7 +3228,7 @@ def predict_quantiles( X: XSelector | None = None, *, alpha: Float | Sequence[Float] = (0.05, 0.95), - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> XReturn: """Get quantile forecasts on new data or existing rows. @@ -3266,7 +3278,7 @@ def predict_residuals( y: RowSelector | YSelector, X: XSelector | None = None, *, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> YReturn: """Get residuals of forecasts on new data or existing rows. @@ -3307,7 +3319,7 @@ def predict_var( X: XSelector | None = None, *, cov: Bool = False, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> XReturn: """Get variance forecasts on new data or existing rows. @@ -3359,7 +3371,7 @@ def score( fh: RowSelector | FHConstructor | None = None, *, metric: str | MetricFunction | Scorer | None = None, - verbose: Int | None = None, + verbose: Verbose | None = None, ) -> Float: """Get a metric score on new data. diff --git a/atom/basetransformer.py b/atom/basetransformer.py index ea648bdf9..a38b621c8 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -10,6 +10,7 @@ import os import random import re +from pandas._typing import Axes import tempfile import warnings from collections.abc import Hashable @@ -367,21 +368,21 @@ def _device_id(self) -> int: @staticmethod @overload def _check_input( - X: XSelector, + X: XSelector | None, y: Literal[None], *, - columns: Sequence[str] | None = None, - name: str | Sequence[str] | None = None, + columns: Axes | None = ..., + name: str | Axes | None = ..., ) -> tuple[pd.DataFrame, None]: ... @staticmethod @overload def _check_input( X: Literal[None], - y: YSelector, + y: YSelector | None = ..., *, - columns: Sequence[str] | None = None, - name: str | Sequence[str] | None = None, + columns: Axes | None = ..., + name: str | Axes | None = ..., ) -> tuple[None, Pandas]: ... @staticmethod @@ -390,8 +391,8 @@ def _check_input( X: XSelector, y: YSelector, *, - columns: Sequence[str] | None = None, - name: str | Sequence[str] | None = None, + columns: Axes | None = ..., + name: str | Axes | None = ..., ) -> tuple[pd.DataFrame, Pandas]: ... @staticmethod @@ -399,8 +400,8 @@ def _check_input( X: XSelector | None = None, y: YSelector | None = None, *, - columns: Sequence[str] | None = None, - name: str | Sequence[str] | None = None, + columns: Axes | None = None, + name: str | Axes | None = None, ) -> tuple[pd.DataFrame | None, Pandas | None]: """Prepare the input data. 
diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py index 5bb0e080f..1f90d77eb 100644 --- a/atom/data/dataengines.py +++ b/atom/data/dataengines.py @@ -65,11 +65,6 @@ class PandasPyarrowEngine(DataEngine): library = "pandas" - def _to_numpy_dtype(self, dtype: np.dtype) -> pa.DataType: - """Convert numpy dtype to pyarrow dtype.""" - if isinstance(dtype, np.dtype): - return pa.from_numpy_dtype(dtype) # TODO: Handle numpy nullable types - @staticmethod def convert(obj: Pandas) -> Pandas: """Convert to pyarrow dtypes.""" @@ -78,7 +73,7 @@ def convert(obj: Pandas) -> Pandas: if isinstance(obj, pd.DataFrame): return obj.astype( { - c: pd.ArrowDtype(from_numpy_dtype(d)) if isinstance(d, np.dtype) else d + c: pd.ArrowDtype(from_numpy_dtype(getattr(d, "numpy_dtype", d))) for c, d in obj.dtypes.items() } ) From 3a51c67996159e2ea209722758e7ee1601f6783b Mon Sep 17 00:00:00 2001 From: Mavs Date: Fri, 23 Feb 2024 22:42:00 +0100 Subject: [PATCH 09/12] fix partial tests --- atom/atom.py | 8 ++++---- atom/basemodel.py | 2 +- atom/basetransformer.py | 5 ++--- atom/data_cleaning.py | 18 +++++++++++------- atom/feature_engineering.py | 4 ++-- atom/nlp.py | 4 ++-- atom/utils/types.py | 4 ++-- atom/utils/utils.py | 15 +++++++-------- tests/test_atom.py | 1 - 9 files changed, 31 insertions(+), 30 deletions(-) diff --git a/atom/atom.py b/atom/atom.py index cad84549f..974838f17 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -53,10 +53,10 @@ FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IndexSelector, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems, NJobs, - NormalizerStrats, NumericalStrats, Operators, Predictor, - PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, - SPDict, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, - XReturn, XSelector, YReturn, YSelector, sequence_t, + NormalizerStrats, NumericalStrats, Operators, Predictor, PrunerStrats, + RowSelector, Scalar, ScalerStrats, Seasonality, Sequence, SPDict, + TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, XReturn, + XSelector, YReturn, YSelector, sequence_t, ) from atom.utils.utils import ( ClassMap, DataConfig, DataContainer, Goal, adjust, check_dependency, diff --git a/atom/basemodel.py b/atom/basemodel.py index 7a7745fb1..afd5388ac 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -274,7 +274,7 @@ def __init__( self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts if getattr(self, "needs_scaling", None) and not self.branch.check_scaling(): - self.scaler = Scaler().fit(self.X_train) + self.scaler = Scaler(engine=self.engine).fit(self.X_train) def __repr__(self) -> str: """Display class name.""" diff --git a/atom/basetransformer.py b/atom/basetransformer.py index a38b621c8..4e3deb768 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -10,7 +10,6 @@ import os import random import re -from pandas._typing import Axes import tempfile import warnings from collections.abc import Hashable @@ -28,13 +27,13 @@ import pandas as pd from beartype import beartype from joblib.memory import Memory +from pandas._typing import Axes from sklearn.utils.validation import check_memory from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas, - Sequence, Severity, Verbose, Warnings, XSelector, YSelector, - bool_t, int_t, + Severity, Verbose, Warnings, XSelector, 
YSelector, bool_t, int_t, ) from atom.utils.utils import ( check_dependency, crash, lst, make_sklearn, to_df, to_tabular, diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 282a8dd91..89168d945 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -40,13 +40,17 @@ from sklearn.experimental import enable_iterative_imputer # noqa: F401 from sklearn.impute import IterativeImputer, KNNImputer from sklearn.utils.validation import _check_feature_names_in +from sktime.transformations.series.detrend import ( + ConditionalDeseasonalizer, Deseasonalizer, Detrender, +) from sktime.transformations.series.impute import Imputer from typing_extensions import Self from atom.basetransformer import BaseTransformer from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING from atom.utils.types import ( - Bins, Bool, CategoricalStrats, DiscretizerStrats, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, + Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, + EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, SeasonalityModels, Sequence, Transformer, Verbose, @@ -711,7 +715,7 @@ def __init__( drop_missing_target: Bool = True, encode_target: Bool = True, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, ): super().__init__(device=device, engine=engine, verbose=verbose) @@ -1405,7 +1409,7 @@ def __init__( bins: Bins = 5, labels: Sequence[str] | dict[str, Sequence[str]] | None = None, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, ): @@ -2120,7 +2124,7 @@ def __init__( max_nan_cols: FloatLargerZero | None = None, n_jobs: NJobs = 1, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, ): @@ -2512,7 +2516,7 @@ def __init__( strategy: NormalizerStrats = "yeojohnson", *, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, random_state: IntLargerEqualZero | None = None, **kwargs, @@ -2790,7 +2794,7 @@ def __init__( max_sigma: FloatLargerZero = 3, include_target: Bool = False, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, **kwargs, ): @@ -3051,7 +3055,7 @@ def __init__( *, include_binary: Bool = False, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, **kwargs, ): diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index d06c39a5b..ccd15d94b 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -33,7 +33,7 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( - Bool, EngineEstimatorOptions, FeatureSelectionSolvers, + Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, Verbose, XConstructor, YConstructor, @@ -985,7 +985,7 @@ def __init__( max_correlation: FloatZeroToOneInc | None = 1.0, n_jobs: NJobs = 1, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose 
= 0, random_state: IntLargerEqualZero | None = None, **kwargs, diff --git a/atom/nlp.py b/atom/nlp.py index 85154552b..29aedd362 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -21,7 +21,7 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, EngineEstimatorOptions, FloatLargerZero, Sequence, + Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, XConstructor, YConstructor, bool_t, ) from atom.utils.utils import ( @@ -886,7 +886,7 @@ def __init__( *, return_sparse: Bool = True, device: str = "cpu", - engine: EngineEstimatorOptions = None, + engine: Engine = None, verbose: Verbose = 0, **kwargs, ): diff --git a/atom/utils/types.py b/atom/utils/types.py index 32c1ae330..798d09c06 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -399,10 +399,10 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Return types for transform methods if TYPE_CHECKING: + import dask.dataframe as dd + import modin.pandas as md import polars as pl import pyarrow as pa - import modin.pandas as md - import dask.dataframe as dd import pyspark.pandas as ps from pyspark.sql import DataFrame as SparkDataFrame diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 5be2e8892..61b4e37e9 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -22,7 +22,7 @@ from inspect import Parameter, signature from itertools import cycle from types import GeneratorType, MappingProxyType -from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload, Hashable +from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload import numpy as np import pandas as pd @@ -43,11 +43,10 @@ from atom.utils.constants import CAT_TYPES, __version__ from atom.utils.types import ( - Bool, EngineTuple, Estimator, FeatureNamesOut, Float, IndexSelector, Int, - IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, - Scorer, Segment, Sequence, SPTuple, Transformer, Verbose, - XConstructor, XReturn, YConstructor, int_t, segment_t, - sequence_t, EngineDataOptions + Bool, EngineDataOptions, EngineTuple, Estimator, FeatureNamesOut, Float, + IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, + Predictor, Scalar, Scorer, Segment, Sequence, SPTuple, Transformer, + Verbose, XConstructor, YConstructor, int_t, segment_t, sequence_t, ) @@ -2184,9 +2183,9 @@ def name_cols( mask = original_df.apply( # type: ignore[type-var] lambda c: np.array_equal( a1=c, - a2=str(name), + a2=column, equal_nan=is_numeric_dtype(c) and np.issubdtype(column.dtype, np.number), - ), + ) ) if any(mask) and mask[mask].index[0] not in temp_cols: diff --git a/tests/test_atom.py b/tests/test_atom.py index 4fc68e66e..f04906c3e 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -454,7 +454,6 @@ def test_shrink_pyarrow(): atom = ATOMClassifier(X_pa, y_bin, engine="pandas-pyarrow", random_state=1) assert atom.dtypes[0].name == "double[pyarrow]" atom.shrink() - print(atom.branch.dataset.dtypes[0]) assert atom.dtypes[0].name == "float[pyarrow]" From 63ac60d16a69c95b76c9f385e52b31013992a7d4 Mon Sep 17 00:00:00 2001 From: Mavs Date: Sun, 25 Feb 2024 12:40:38 +0100 Subject: [PATCH 10/12] fixing type hints --- atom/basemodel.py | 27 +++-- atom/baserunner.py | 18 ++-- atom/basetransformer.py | 54 +++++----- atom/data/dataengines.py | 9 +- atom/data_cleaning.py | 197 ++++++++++++++++++++++-------------- atom/feature_engineering.py | 79 +++++++-------- atom/nlp.py | 14 +-- atom/pipeline.py | 141 +++++++++++++++++--------- atom/utils/types.py | 30 ++++-- atom/utils/utils.py | 122 
+++++++++++----------- pyproject.toml | 1 + tests/conftest.py | 4 +- tests/test_data.py | 7 +- 13 files changed, 402 insertions(+), 301 deletions(-) diff --git a/atom/basemodel.py b/atom/basemodel.py index afd5388ac..6fb2ed6d4 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -274,7 +274,7 @@ def __init__( self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts if getattr(self, "needs_scaling", None) and not self.branch.check_scaling(): - self.scaler = Scaler(engine=self.engine).fit(self.X_train) + self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train) def __repr__(self) -> str: """Display class name.""" @@ -2664,19 +2664,19 @@ def assign_prediction_columns() -> list[str]: pred = np.array(self.memory.cache(getattr(self.estimator, method))(Xt[self.features])) if pred.ndim == 1 or pred.shape[1] == 1: - data = to_series(pred, index=Xt.index, name=self.target) + return to_series(pred, index=Xt.index, name=self.target) elif pred.ndim < 3: - data = to_df(pred, index=Xt.index, columns=assign_prediction_columns()) + return to_df(pred, index=Xt.index, columns=assign_prediction_columns()) elif self.task is Task.multilabel_classification: # Convert to (n_samples, n_targets) - data = pd.DataFrame( + return pd.DataFrame( data=np.array([d[:, 1] for d in pred]).T, index=Xt.index, columns=assign_prediction_columns(), ) else: # Convert to (n_samples * n_classes, n_targets) - data = pd.DataFrame( + return pd.DataFrame( data=pred.reshape(-1, pred.shape[2]), index=pd.MultiIndex.from_tuples( [(col, idx) for col in np.unique(self.y) for idx in Xt.index] @@ -2684,8 +2684,6 @@ def assign_prediction_columns() -> list[str]: columns=assign_prediction_columns(), ) - return data - else: if metric is None: scorer = self._metric[0] @@ -2971,6 +2969,18 @@ def _prediction( **kwargs, ) -> Float: ... + @overload + def _prediction( + self, + fh: RowSelector | FHConstructor | None = ..., + y: RowSelector | YSelector | None = ..., + X: XSelector | None = ..., + metric: str | MetricFunction | Scorer | None = ..., + verbose: Verbose | None = ..., + method: Literal["predict_proba"] = ..., + **kwargs, + ) -> Normal: ... + @overload def _prediction( self, @@ -2982,13 +2992,12 @@ def _prediction( method: Literal[ "predict", "predict_interval", - "predict_proba", "predict_quantiles", "predict_residuals", "predict_var", ] = ..., **kwargs, - ) -> Normal | Pandas: ... + ) -> Pandas: ... def _prediction( self, diff --git a/atom/baserunner.py b/atom/baserunner.py index eb8a9a895..39541d169 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -562,7 +562,7 @@ def _no_data_sets( "Invalid value for the index parameter. Length of index " f"({len(index)}) doesn't match that of the dataset ({len(data)})." ) - data.index = index + data.index = pd.Index(index) if len(data) < 5: raise ValueError( @@ -722,10 +722,10 @@ def _has_data_sets( "Invalid value for the index parameter. Length of index " f"({len(index)}) doesn't match that of the data sets ({len_data})." 
) - train.index = index[: len(train)] - test.index = index[len(train): len(train) + len(test)] + train.index = pd.Index(index[: len(train)]) + test.index = pd.Index(index[len(train): len(train) + len(test)]) if holdout is not None: - holdout.index = index[-len(holdout):] + holdout.index = pd.Index(index[-len(holdout):]) complete_set = _set_index(pd.concat([train, test, holdout]), y_test, index) @@ -746,7 +746,7 @@ def _has_data_sets( if len(arrays) == 0: if self.branch._container: return self.branch._data, self.branch._holdout - elif self._goal is Goal.forecast and not isinstance(y, Int | str): + elif self._goal is Goal.forecast and not isinstance(y, (*int_t, str)): # arrays=() and y=y for forecasting sets = _no_data_sets(*self._check_input(y=y)) else: @@ -1132,7 +1132,7 @@ def export_pipeline(self, model: str | Model | None = None) -> Pipeline: def get_class_weight( self, rows: RowSelector = "train", - ) -> dict[Hashable, float] | dict[str, dict[Hashable, float]]: + ) -> dict[Hashable, float] | dict[Hashable, dict[Hashable, float]]: """Return class weights for a balanced data set. Statistically, the class weights re-balance the data set so @@ -1173,10 +1173,10 @@ def get_weights(col: pd.Series) -> dict[Hashable, float]: _, y = self.branch._get_rows(rows, return_X_y=True) - if self.task.is_multioutput: - return {str(col.name): get_weights(col) for col in get_cols(y)} - else: + if isinstance(y, pd.Series): return get_weights(y) + else: + return {col.name: get_weights(col) for col in get_cols(y)} @available_if(has_task("classification")) @composed(crash, beartype) diff --git a/atom/basetransformer.py b/atom/basetransformer.py index 4e3deb768..19a005439 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -33,7 +33,7 @@ from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas, - Severity, Verbose, Warnings, XSelector, YSelector, bool_t, int_t, + Severity, Verbose, Warnings, XSelector, YSelector, bool_t, int_t, YReturn, XReturn ) from atom.utils.utils import ( check_dependency, crash, lst, make_sklearn, to_df, to_tabular, @@ -367,8 +367,8 @@ def _device_id(self) -> int: @staticmethod @overload def _check_input( - X: XSelector | None, - y: Literal[None], + X: XSelector, + y: Literal[None] = ..., *, columns: Axes | None = ..., name: str | Axes | None = ..., @@ -378,7 +378,7 @@ def _check_input( @overload def _check_input( X: Literal[None], - y: YSelector | None = ..., + y: YSelector, *, columns: Axes | None = ..., name: str | Axes | None = ..., @@ -447,13 +447,30 @@ def _check_input( Xt = to_df(X() if callable(X) else X, columns=columns) # Prepare target column - if not isinstance(y, Int | str | None): + yt: Pandas | None + if y is None: + yt = None + elif isinstance(y, int_t): + if Xt is None: + raise ValueError("X can't be None when y is an int.") + + Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] + elif isinstance(y, str): + if Xt is not None: + if y not in Xt.columns: + raise ValueError(f"Column {y} not found in X!") + + Xt, yt = Xt.drop(columns=y), Xt[y] + + else: + raise ValueError("X can't be None when y is a string.") + else: # If X and y have different number of rows, try multioutput if Xt is not None and not isinstance(y, dict) and len(Xt) != len(y): try: targets: list[Hashable] = [] for col in y: - if col in Xt.columns: + if isinstance(col, str) and col in Xt.columns: targets.append(col) elif isinstance(col, int_t): if -Xt.shape[1] <= 
col < Xt.shape[1]: @@ -479,27 +496,18 @@ def _check_input( if Xt is not None and not Xt.index.equals(yt.index): raise ValueError("X and y don't have the same indices!") - elif isinstance(y, str): - if Xt is not None: - if y not in Xt.columns: - raise ValueError(f"Column {y} not found in X!") - - Xt, yt = Xt.drop(columns=y), Xt[y] - - else: - raise ValueError("X can't be None when y is a string.") + return Xt, yt - elif isinstance(y, int_t): - if Xt is None: - raise ValueError("X can't be None when y is an int.") + @overload + def _convert(self, obj: Literal[None]) -> None: ... - Xt, yt = Xt.drop(columns=Xt.columns[int(y)]), Xt[Xt.columns[int(y)]] - else: - yt = y + @overload + def _convert(self, obj: pd.DataFrame) -> XReturn: ... - return Xt, yt + @overload + def _convert(self, obj: pd.Series) -> YReturn: ... - def _convert(self, obj: Any) -> Any: + def _convert(self, obj: Pandas | None) -> YReturn | None: """Convert data to the type set in the data engine. Non-pandas types are returned as is. diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py index 1f90d77eb..74fb19594 100644 --- a/atom/data/dataengines.py +++ b/atom/data/dataengines.py @@ -21,6 +21,7 @@ import modin.pandas as md import polars as pl import pyarrow as pa + import pyspark.sql as psql import pyspark.pandas as ps @@ -167,7 +168,7 @@ class PySparkEngine(DataEngine): library = "pyspark" @staticmethod - def convert(obj: Pandas) -> ps.sql.DataFrame: + def convert(obj: Pandas) -> psql.DataFrame: """Convert to pyspark objects.""" from pyspark.sql import SparkSession @@ -181,14 +182,14 @@ class PySparkPandasEngine(DataEngine): library = "pyspark" @staticmethod - def convert(obj: Pandas) -> ps.pandas.Series | ps.pandas.DataFrame: + def convert(obj: Pandas) -> ps.Series | ps.DataFrame: """Convert to pyspark objects.""" import pyspark.pandas as ps if isinstance(obj, pd.DataFrame): - return ps.pandas.DataFrame(obj) + return ps.DataFrame(obj) else: - return ps.pandas.Series(obj) + return ps.Series(obj) DATA_ENGINES = { diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 89168d945..f778e019d 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -10,7 +10,7 @@ import re from collections import defaultdict from collections.abc import Hashable -from typing import Any, Literal, TypeVar +from typing import Any, Literal, TypeVar, cast, overload import numpy as np import pandas as pd @@ -43,18 +43,18 @@ from sktime.transformations.series.detrend import ( ConditionalDeseasonalizer, Deseasonalizer, Detrender, ) -from sktime.transformations.series.impute import Imputer +from sktime.transformations.series.impute import Imputer as SktimeImputer from typing_extensions import Self from atom.basetransformer import BaseTransformer from atom.utils.constants import CAT_TYPES, DEFAULT_MISSING from atom.utils.types import ( Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, - EngineDataOptions, EngineTuple, Estimator, - FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, - NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor, PrunerStrats, - Scalar, ScalerStrats, SeasonalityModels, Sequence, Transformer, Verbose, - XConstructor, YConstructor, sequence_t, + EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int, + IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, + NumericalStrats, Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, + SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, + YConstructor, sequence_t, XReturn, YReturn, ) from 
atom.utils.utils import ( Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst, @@ -105,7 +105,12 @@ def __sklearn_clone__(self: T_Transformer) -> T_Transformer: return cloned - def fit(self, X=None, y=None, **fit_params) -> Self: + def fit( + self, + X: XConstructor | None = None, + y: YConstructor | None = None, + **fit_params, + ) -> Self: """Do nothing. Implemented for continuity of the API. @@ -138,12 +143,36 @@ def fit(self, X=None, y=None, **fit_params) -> Self: return self + @overload + def fit_transform( + self, + X: Literal[None], + y: YConstructor, + **fit_params, + ) -> YReturn: ... + + @overload + def fit_transform( + self, + X: XConstructor, + y: Literal[None] = ..., + **fit_params, + ) -> XReturn: ... + + @overload + def fit_transform( + self, + X: XConstructor, + y: YConstructor, + **fit_params, + ) -> tuple[XReturn, YReturn]: ... + def fit_transform( self, X: XConstructor | None = None, y: YConstructor | None = None, **fit_params, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Fit to data, then transform it. Parameters @@ -170,12 +199,36 @@ def fit_transform( """ return self.fit(X, y, **fit_params).transform(X, y) + @overload + def inverse_transform( + self, + X: Literal[None], + y: YConstructor, + **fit_params, + ) -> YReturn: ... + + @overload + def inverse_transform( + self, + X: XConstructor, + y: Literal[None] = ..., + **fit_params, + ) -> XReturn: ... + + @overload + def inverse_transform( + self, + X: XConstructor, + y: YConstructor, + **fit_params, + ) -> tuple[XReturn, YReturn]: ... + def inverse_transform( self, X: XConstructor | None = None, y: YConstructor | None = None, **fit_params, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Do nothing. Returns the input unchanged. Implemented for continuity of the @@ -202,12 +255,8 @@ def inverse_transform( """ check_is_fitted(self) - Xt = to_df(X, columns=self.feature_names_in_) - yt = to_tabular( - data=y, - index=getattr(Xt, "index", None), - columns=getattr(y, "target_names_in_", None), - ) + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) return variable_return(self._convert(Xt), self._convert(yt)) @@ -242,8 +291,11 @@ def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: Estimator instance. """ + if not hasattr(self, "_engine"): + self.engine = EngineTuple() + if transform is not None: - self._engine = getattr(self, "_engine", EngineTuple()).data = transform + self.engine = EngineTuple(estimator=self.engine.estimator, data=transform) return self @@ -456,11 +508,11 @@ def fit(self, X: XConstructor, y: YConstructor) -> Self: # Create dict of class counts in y if not hasattr(self, "mapping_"): - self.mapping_ = {str(v): v for v in y.sort_values().unique()} + self.mapping_ = {str(v): v for v in yt.sort_values().unique()} self._counts = {} for key, value in self.mapping_.items(): - self._counts[key] = np.sum(y == value) + self._counts[key] = np.sum(yt == value) self._estimator = estimator.fit(Xt, yt) @@ -469,7 +521,7 @@ def fit(self, X: XConstructor, y: YConstructor) -> Self: return self - def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd.Series]: + def transform(self, X: XConstructor, y: YConstructor) -> tuple[XReturn, YReturn]: """Balance the data. Parameters @@ -492,7 +544,7 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. 
check_is_fitted(self) Xt = to_df(X, columns=self.feature_names_in_) - yt = to_tabular(y, index=Xt.index, columns=self.target_names_in_) + yt = to_series(y, index=Xt.index, name=self.target_names_in_[0]) # type: ignore[arg-type] if "over_sampling" in self._estimator.__module__: self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1) @@ -511,8 +563,8 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. ] # Assign the old + new indices - Xt.index = list(index) + list(n_idx) - yt.index = list(index) + list(n_idx) + Xt.index = pd.Index(list(index) + n_idx) + yt.index = pd.Index(list(index) + n_idx) self._log_changes(yt) @@ -522,8 +574,8 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. self._estimator.fit_resample(Xt, yt) # Select chosen rows (imblearn doesn't return them in order) - samples = sorted(self._estimator.sample_indices_) - Xt, yt = Xt.iloc[samples], yt.iloc[samples] # type: ignore[call-overload] + samples = np.asarray(sorted(self._estimator.sample_indices_)) + Xt, yt = Xt.iloc[samples], yt.iloc[samples] self._log_changes(yt) @@ -535,9 +587,9 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. # Select rows kept by the undersampler if self._estimator.__class__.__name__ == "SMOTEENN": - samples = sorted(self._estimator.enn_.sample_indices_) + samples = np.asarray(sorted(self._estimator.enn_.sample_indices_)) elif self._estimator.__class__.__name__ == "SMOTETomek": - samples = sorted(self._estimator.tomek_.sample_indices_) + samples = np.asarray(sorted(self._estimator.tomek_.sample_indices_)) # Select the remaining samples from the old dataframe o_samples = [s for s in samples if s < len(Xt)] @@ -554,9 +606,9 @@ def transform(self, X: XConstructor, y: YConstructor) -> tuple[pd.DataFrame, pd. # Select the new samples and assign the new indices X_new = X_new.iloc[-len(X_new) + len(o_samples):] - X_new.index = n_idx + X_new.index = pd.Index(n_idx) y_new = y_new.iloc[-len(y_new) + len(o_samples):] - y_new.index = n_idx + y_new.index = pd.Index(n_idx) # First, output the samples created for key, value in self.mapping_.items(): @@ -752,6 +804,7 @@ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> S self._check_n_features(Xt, reset=True) self.mapping_: dict[str, Any] = {} + self.target_names_in_ = np.array([]) self._drop_cols = [] self._estimators = {} @@ -786,7 +839,9 @@ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> S elif list(uq := np.unique(col)) != list(range(col.nunique())): LabelEncoder = self._get_est_class("LabelEncoder", "preprocessing") self._estimators[col.name] = LabelEncoder().fit(col) - self.mapping_.update({col.name: {str(it(v)): i for i, v in enumerate(uq)}}) + self.mapping_.update( + {str(col.name): {str(it(v)): i for i, v in enumerate(uq)}} + ) return self @@ -820,7 +875,7 @@ def transform( self, X: XConstructor | None = None, y: YConstructor | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Apply the data cleaning steps to the data. 
Parameters @@ -844,11 +899,7 @@ def transform( check_is_fitted(self) Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) - yt = to_tabular( - data=y, - index=getattr(Xt, "index", None), - columns=getattr(self, "target_names_in_", None), - ) + yt = to_tabular(y, index=getattr(Xt, "index", None), columns=self.target_names_in_) self._log("Cleaning the data...", 1) @@ -907,8 +958,7 @@ def transform( if est := self._estimators.get(col.name): if n_cols(out := est.transform(col)) == 1: self._log(f" --> Label-encoding column {col.name}.", 2) - out = to_series(out, yt.index, col.name) - + out = to_series(out, yt.index, str(col.name)) else: self._log(f" --> Label-binarizing column {col.name}.", 2) out = to_df( @@ -937,7 +987,7 @@ def inverse_transform( self, X: XConstructor | None = None, y: YConstructor | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inversely transform the label encoding. This method only inversely transforms the target encoding. @@ -1166,7 +1216,6 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: goal=Goal.forecast, **{x: getattr(self, x) for x in BaseTransformer.attrs if hasattr(self, x)}, ) - model.task = Goal.forecast.infer_task(y) forecaster = model._get_est({}) else: raise ValueError( @@ -1209,7 +1258,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: return self - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Decompose the data. Parameters @@ -1237,7 +1286,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr return self._convert(Xt) - def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Inversely transform the data. Parameters @@ -1484,8 +1533,8 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) - self._estimators: dict[str, Estimator] = {} - self._labels: dict[str, Sequence[str]] = {} + self._estimators: dict[Hashable, Estimator] = {} + self._labels: dict[Hashable, Sequence[str]] = {} self._log("Fitting Discretizer...", 1) @@ -1493,7 +1542,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: # Assign bins per column if isinstance(self.bins, dict): if col in self.bins: - bins_c = self.bins[col] + bins_c = self.bins[str(col)] else: continue # Ignore existing column not specified in dict else: @@ -1507,7 +1556,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: raise ValueError( "Invalid value for the bins parameter. The length of the " "bins does not match the length of the columns, got len" - f"(bins)={len(bins_c)} and len(columns)={X.shape[1]}." + f"(bins)={len(bins_c)} and len(columns)={Xt.shape[1]}." 
) from None else: bins_x = bins_c @@ -1529,7 +1578,7 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: # Save labels for transform method self._labels[col] = get_labels( - col=col, + col=str(col), bins=self._estimators[col].bin_edges_[0], ) @@ -1550,12 +1599,12 @@ def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]: # Make of cut a transformer self._estimators[col] = FunctionTransformer( func=pd.cut, - kw_args={"bins": bins_c, "labels": get_labels(col, bins_c)}, + kw_args={"bins": bins_c, "labels": get_labels(str(col), bins_c)}, ).fit(Xt[[col]]) return self - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Bin the data into intervals. Parameters @@ -1817,7 +1866,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: if self.infrequent_to_value: if self.infrequent_to_value < 1: - infrequent_to_value = int(self.infrequent_to_value * len(X)) + infrequent_to_value = int(self.infrequent_to_value * len(Xt)) else: infrequent_to_value = int(self.infrequent_to_value) @@ -1916,7 +1965,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> return get_col_order(cols, self.feature_names_in_, self._estimator.feature_names_in_) - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Encode the data. Parameters @@ -2211,7 +2260,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: elif self.strat_num == "drop": num_imputer = "passthrough" else: - num_imputer = make_sklearn(Imputer)( + num_imputer = make_sklearn(SktimeImputer)( method=self.strat_num, missing_values=[pd.NA], random_state=self.random_state, @@ -2275,7 +2324,7 @@ def transform( self, X: XConstructor, y: YConstructor | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Impute the missing values. Note that leaving y=None can lead to inconsistencies in @@ -2594,7 +2643,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: return self - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the transformations to the data. Parameters @@ -2621,7 +2670,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr return self._convert(Xt) - def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the inverse transformation to the data. 
Parameters @@ -2644,13 +2693,9 @@ def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> p self._log("Inversely normalizing features...", 1) - Xt.update( - to_df( - data=self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]), - index=Xt.index, - columns=self._estimator.feature_names_in_, - ) - ) + out: np.ndarray = self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]) + + Xt.update(to_df(out, index=Xt.index, columns=self._estimator.feature_names_in_)) return self._convert(Xt) @@ -2809,7 +2854,7 @@ def transform( self, X: XConstructor, y: YConstructor | None = None, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Apply the outlier strategy on the data. Parameters @@ -2829,8 +2874,8 @@ def transform( Transformed target column. Only returned if provided. """ - Xt = to_df(X) - yt = to_series(y, index=Xt.index) + Xt = to_df(X, columns=getattr(self, "feature_names_in_", None)) + yt = to_tabular(y, index=Xt.index) # Estimators with their modules strategies = { @@ -2932,13 +2977,13 @@ def transform( if outliers: # Select outliers from intersection of strategies - mask = [any(strats) for strats in zip(*outliers, strict=True)] - self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2) + outlier_rows = [any(strats) for strats in zip(*outliers, strict=True)] + self._log(f" --> Dropping {len(outlier_rows) - sum(outlier_rows)} outliers.", 2) # Keep only the non-outliers from the data - Xt = Xt[mask] + Xt = Xt[outlier_rows] if yt is not None: - yt = yt[mask] + yt = yt[outlier_rows] else: # Replace the columns in X and y with the new values from objective @@ -3115,7 +3160,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: return self - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Perform standardization by centering and scaling. Parameters @@ -3142,7 +3187,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr return self._convert(Xt) - def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the inverse transformation to the data. 
Parameters @@ -3165,12 +3210,8 @@ def inverse_transform(self, X: XConstructor, y: YConstructor | None = None) -> p self._log("Inversely scaling features...", 1) - Xt.update( - to_df( - data=self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]), - index=Xt.index, - columns=self._estimator.feature_names_in_, - ) - ) + out: np.ndarray = self._estimator.inverse_transform(Xt[self._estimator.feature_names_in_]) + + Xt.update(to_df(out, index=Xt.index, columns=self._estimator.feature_names_in_)) return self._convert(Xt) diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index ccd15d94b..c333fd86b 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -33,10 +33,10 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.utils.types import ( - Bool, Engine, FeatureSelectionSolvers, - FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, - FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, - Scalar, Sequence, Verbose, XConstructor, YConstructor, + Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, + FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, + IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, + Verbose, XConstructor, YConstructor, XReturn ) from atom.utils.utils import ( Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse, @@ -172,7 +172,7 @@ def __init__( self.drop_columns = drop_columns self.from_index = from_index - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Extract the new features. Parameters @@ -208,7 +208,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr col_dt = pd.to_datetime( arg=column, errors="coerce", # Converts to NaT if he can't format - format=self.fmt.get(name) if isinstance(self.fmt, dict) else self.fmt, + format=self.fmt.get(str(name)) if isinstance(self.fmt, dict) else self.fmt, ) # If >30% values are NaT, the conversion was unsuccessful @@ -252,7 +252,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr min_val, max_val = 1, col_dt.dt.daysinmonth elif fx in ("dayofyear", "day_of_year"): min_val = 1 - max_val = [365 if i else 366 for i in col_dt.dt.is_leap_year] + max_val = pd.Series([365 if i else 366 for i in col_dt.dt.is_leap_year]) elif fx == "month": min_val, max_val = 1, 12 elif fx == "quarter": @@ -262,18 +262,18 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr if self.encoding_type == "ordinal" or max_val is None: self._log(f" --> Creating feature {new_name}.", 2) X_new[new_name] = series.to_numpy() - order.insert(order.index(name) + 1, new_name) + order.insert(order.index(str(name)) + 1, new_name) elif self.encoding_type == "cyclic": self._log(f" --> Creating cyclic feature {new_name}.", 2) pos = 2 * np.pi * (series.to_numpy() - min_val) / np.array(max_val) X_new[f"{new_name}_sin"] = np.sin(pos) X_new[f"{new_name}_cos"] = np.cos(pos) - order.insert(order.index(name) + 1, f"{new_name}_sin") - order.insert(order.index(name) + 2, f"{new_name}_cos") + order.insert(order.index(str(name)) + 1, f"{new_name}_sin") + order.insert(order.index(str(name)) + 2, f"{new_name}_cos") # Drop the original column if self.drop_columns or self.from_index: - order.remove(name) + order.remove(str(name)) return self._convert(merge(X_new, Xt)[order]) 
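# Illustrative sketch (not part of the patch) of the conversion pattern the
# transform hunks above follow: accept any dataframe-like input, coerce it to
# pandas for the internal work, and hand the result back so the configured
# data engine can convert it on the way out. `to_frame` is a simplified
# stand-in for ATOM's `to_df` helper; the toy transformer is hypothetical.
import numpy as np
import pandas as pd


def to_frame(X, columns=None) -> pd.DataFrame:
    # Simplified coercion: keep DataFrames as-is, wrap everything else.
    if isinstance(X, pd.DataFrame):
        return X
    return pd.DataFrame(X, columns=columns)


class ToyImputer:
    def transform(self, X) -> pd.DataFrame:
        Xt = to_frame(X)  # work on a pandas copy, like the Xt variables above
        return Xt.fillna(Xt.mean(numeric_only=True))


print(ToyImputer().transform(np.array([[1.0, np.nan], [3.0, 4.0]])))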
@@ -495,7 +495,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: init_depth=kwargs.pop("init_depth", (1, 2)), const_range=kwargs.pop("const_range", None), function_set=operators, - feature_names=X.columns, + feature_names=Xt.columns, verbose=kwargs.pop("verbose", 0 if self.verbose < 2 else 1), n_jobs=kwargs.pop("n_jobs", self.n_jobs), random_state=kwargs.pop("random_state", self.random_state), @@ -504,7 +504,7 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: return self - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Generate new features. Parameters @@ -568,7 +568,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr counter = 0 while True: name = f"x{Xt.shape[1] + counter}" - if name not in X: + if name not in Xt: Xt[name] = array # Add new feature to X df.iloc[i, 0] = name break @@ -674,7 +674,7 @@ def __init__( self.operators = operators self.drop_columns = drop_columns - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Group features. Parameters @@ -1028,14 +1028,6 @@ def fit(self, X: XConstructor, y: YConstructor | None = None) -> Self: """ from atom.models import MODELS - def check_y(): - """For some strategies, y needs to be provided.""" - if y is None: - raise ValueError( - "Invalid value for the y parameter. Value cannot " - f"be None for strategy='{self.strategy}'." - ) - def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): """Objective function for the advanced optimization strategies.""" if X_train.equals(X_valid): @@ -1048,6 +1040,12 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): Xt = to_df(X) yt = to_tabular(y, index=Xt.index) + if yt is None and self.strategy != "pca": + raise ValueError( + "Invalid value for the y parameter. Value cannot " + f"be None for strategy='{self.strategy}'." 
+ ) + self._check_feature_names(Xt, reset=True) self._check_n_features(Xt, reset=True) @@ -1058,7 +1056,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._high_variance: dict[Hashable, tuple[Hashable, int]] = {} self._low_variance: dict[Hashable, tuple[Hashable, float]] = {} self._estimator: Any = None - self._n_features = None + self._n_features: int | None = None if isinstance(self.strategy, str): if self.strategy not in ("univariate", "pca"): @@ -1094,7 +1092,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): if hasattr(self, x) }, ) - model.task = goal.infer_task(y) + if yt is not None: + model.task = goal.infer_task(yt) solver = model._get_est({}) else: raise ValueError( @@ -1125,13 +1124,13 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): elif self.n_features < 1: self._n_features = int(self.n_features * Xt.shape[1]) else: - self._n_features = self.n_features + self._n_features = int(self.n_features) min_repeated: Scalar if self.min_repeated is None: min_repeated = 1 elif self.min_repeated <= 1: - min_repeated = self.min_repeated * len(X) + min_repeated = self.min_repeated * len(Xt) else: min_repeated = int(self.min_repeated) @@ -1242,14 +1241,13 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): else: solver = self.solver - check_y() self._estimator = SelectKBest(solver, k=self._n_features).fit(Xt, yt) elif self.strategy == "pca": if not is_sparse(Xt): # PCA requires the features to be scaled if not check_scaling(Xt): - self.scaler_ = Scaler() + self.scaler_ = Scaler(device=self.device, engine=self.engine) Xt = self.scaler_.fit_transform(Xt) estimator = self._get_est_class("PCA", "decomposition") @@ -1300,13 +1298,10 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) self._estimator.estimator_ = solver else: - check_y() self._estimator.fit(Xt, yt) elif self.strategy in ("sfs", "rfe", "rfecv"): if self.strategy == "sfs": - check_y() - if self.kwargs.get("scoring"): kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"]) @@ -1318,8 +1313,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) elif self.strategy == "rfe": - check_y() - self._estimator = RFE( estimator=solver, n_features_to_select=self._n_features, @@ -1327,13 +1320,11 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): ) elif self.strategy == "rfecv": - check_y() - if self.kwargs.get("scoring"): kwargs["scoring"] = get_custom_scorer(self.kwargs["scoring"]) # Invert n_features to select them all (default option) - if self._n_features == X.shape[1]: + if self._n_features == Xt.shape[1]: self._n_features = 1 self._estimator = RFECV( @@ -1346,7 +1337,6 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): self._estimator.fit(Xt, yt) else: - check_y() strategies = { "pso": ParticleSwarmOptimization, "hho": HarrisHawkOptimization, @@ -1375,7 +1365,8 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): kwargs["scoring"] = get_custom_scorer(kwargs["scoring"]) else: goal = Goal(0) if is_classifier(solver) else Goal(1) - task = goal.infer_task(yt) + if yt is not None: + task = goal.infer_task(yt) if task is Task.binary_classification: kwargs["scoring"] = get_custom_scorer("f1") elif task.is_multiclass: @@ -1447,7 +1438,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> ] ) - def transform(self, X: XConstructor, y: YConstructor | None = 
None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Transform the data. Parameters @@ -1475,7 +1466,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr self._log( f" --> Feature {fx} was removed due to high variance. " f"Value {h_variance[0]} was the most repeated value with " - f"{h_variance[1]} ({h_variance[1] / len(X):.1f}%) occurrences.", 2, + f"{h_variance[1]} ({h_variance[1] / len(Xt):.1f}%) occurrences.", 2, ) Xt = Xt.drop(columns=fx) @@ -1504,7 +1495,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr f" --> The univariate test selected " f"{self._n_features} features from the dataset.", 2, ) - for n, column in enumerate(X): + for n, column in enumerate(Xt): if not self.univariate_.get_support()[n]: self._log( f" --> Dropping feature {column} " @@ -1530,7 +1521,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr mask = self._estimator.get_support() self._log(f" --> {self.strategy} selected {sum(mask)} features from the dataset.", 2) - for n, column in enumerate(X): + for n, column in enumerate(Xt): if not mask[n]: if hasattr(self._estimator, "ranking_"): self._log( @@ -1547,7 +1538,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr "features from the dataset.", 2, ) - for column in X: + for column in Xt: if column not in self._estimator.best_feature_list: self._log(f" --> Dropping feature {column}.", 2) Xt = Xt.drop(columns=column) diff --git a/atom/nlp.py b/atom/nlp.py index 29aedd362..751ea637c 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -21,8 +21,8 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( - Bool, Engine, FloatLargerZero, Sequence, - VectorizerStarts, Verbose, XConstructor, YConstructor, bool_t, + Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, + XConstructor, YConstructor, bool_t, XReturn ) from atom.utils.utils import ( check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df, @@ -189,7 +189,7 @@ def __init__( self.regex_number = regex_number self.drop_punctuation = drop_punctuation - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Apply the transformations to the data. Parameters @@ -440,7 +440,7 @@ def __init__( self.stem = stem self.lemmatize = lemmatize - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Normalize the text. Parameters @@ -664,7 +664,7 @@ def __init__( self.trigram_freq = trigram_freq self.quadgram_freq = quadgram_freq - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Tokenize the text. Parameters @@ -988,7 +988,7 @@ def get_feature_names_out(self, input_features: Sequence[str] | None = None) -> og_columns = [c for c in self.feature_names_in_ if c != self._corpus] return np.array(og_columns + self._get_corpus_columns()) - def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFrame: + def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: """Vectorize the text. 
Parameters @@ -1026,7 +1026,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> pd.DataFr if not self.return_sparse: self._log(" --> Converting the output to a full array.", 2) matrix = matrix.toarray() - elif not Xt.empty and not is_sparse(X): + elif not Xt.empty and not is_sparse(Xt): # Raise if there are other columns that are non-sparse raise ValueError( "Invalid value for the return_sparse parameter. The value must " diff --git a/atom/pipeline.py b/atom/pipeline.py index 5af808dda..f14686485 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -9,7 +9,7 @@ from collections.abc import Iterator from itertools import islice -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload import numpy as np import pandas as pd @@ -27,11 +27,11 @@ from atom.utils.types import ( Bool, EngineDataOptions, EngineTuple, Estimator, FHConstructor, Float, - Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, YReturn, + Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, YReturn, XReturn ) from atom.utils.utils import ( NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, - transform_one, variable_return, + transform_one, variable_return, to_df, to_tabular ) @@ -39,6 +39,9 @@ from sktime.proba.normal import Normal +T = TypeVar("T") + + class Pipeline(SkPipeline): """Pipeline of transforms with a final estimator. @@ -226,6 +229,15 @@ def _can_inverse_transform(self) -> bool: for _, _, est in self._iter() ) + @overload + def _convert(self, obj: Literal[None]) -> None: ... + + @overload + def _convert(self, obj: pd.DataFrame) -> XReturn: ... + + @overload + def _convert(self, obj: pd.Series) -> YReturn: ... + def _convert(self, obj: Pandas | None) -> YReturn | None: """Convert data to the type set in the data engine. @@ -325,6 +337,9 @@ def _fit( self.steps: list[tuple[str, Estimator]] = list(self.steps) self._validate_steps() + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + for step, name, transformer in self._iter( with_final=False, filter_passthrough=False, filter_train_only=False ): @@ -347,10 +362,10 @@ def _fit( # Fit or load the current estimator from cache # Type ignore because routed_params is never None but # the signature of _fit needs to comply with sklearn's - X, y, fitted_transformer = self._mem_fit_transform( + Xt, yt, fitted_transformer = self._mem_fit_transform( transformer=cloned, - X=X, - y=y, + X=Xt, + y=yt, message=self._log_message(step), **routed_params[name].fit_transform, # type: ignore[index] ) @@ -359,7 +374,7 @@ def _fit( # estimator (necessary when loading from cache) self.steps[step] = (name, fitted_transformer) - return X, y + return Xt, yt def get_metadata_routing(self): """Get metadata routing of this object. 
@@ -470,15 +485,15 @@ def fit( """ routed_params = self._check_method_params(method="fit", props=params) - X, y = self._fit(X, y, routed_params) + Xt, yt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator is not None and self._final_estimator != "passthrough": - with adjust(self._final_estimator, self._verbose): + with adjust(self._final_estimator, verbose=self._verbose): self._mem_fit( estimator=self._final_estimator, - X=X, - y=y, + X=Xt, + y=yt, **routed_params[self.steps[-1][0]].fit, ) @@ -490,7 +505,7 @@ def fit_transform( X: XConstructor | None = None, y: YConstructor | None = None, **params, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Fit the pipeline and transform the data. Call `fit` followed by `transform` on each transformer in the @@ -525,21 +540,21 @@ def fit_transform( """ routed_params = self._check_method_params(method="fit_transform", props=params) - X, y = self._fit(X, y, routed_params) + Xt, yt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator is None or self._final_estimator == "passthrough": - return variable_return(X, y) + return variable_return(Xt, yt) with adjust(self._final_estimator, verbose=self._verbose): - X, y, _ = self._mem_fit_transform( + Xt, yt, _ = self._mem_fit_transform( transformer=self._final_estimator, - X=X, - y=y, + X=Xt, + y=yt, **routed_params[self.steps[-1][0]].fit_transform, ) - return variable_return(self._convert(X), self._convert(y)) + return variable_return(self._convert(Xt), self._convert(yt)) @available_if(_can_transform) def transform( @@ -549,7 +564,7 @@ def transform( *, filter_train_only: Bool = True, **params, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Transform the data. Call `transform` on each transformer in the pipeline. The @@ -589,19 +604,22 @@ def transform( if X is None and y is None: raise ValueError("X and y cannot be both None.") + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + _raise_for_params(params, self, "transform") routed_params = process_routing(self, "transform", **params) for _, name, transformer in self._iter(filter_train_only=filter_train_only): with adjust(transformer, verbose=self._verbose): - X, y = self._mem_transform( + Xt, yt = self._mem_transform( transformer=transformer, - X=X, - y=y, + X=Xt, + y=yt, **routed_params[name].transform, ) - return variable_return(self._convert(X), self._convert(y)) + return variable_return(self._convert(Xt), self._convert(yt)) @available_if(_can_inverse_transform) def inverse_transform( @@ -611,7 +629,7 @@ def inverse_transform( *, filter_train_only: Bool = True, **params, - ) -> Pandas | tuple[pd.DataFrame, Pandas]: + ) -> YReturn | tuple[XReturn, YReturn]: """Inverse transform for each step in a reverse order. 
All estimators in the pipeline must implement the @@ -647,21 +665,24 @@ def inverse_transform( if X is None and y is None: raise ValueError("X and y cannot be both None.") + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + _raise_for_params(params, self, "inverse_transform") routed_params = process_routing(self, "inverse_transform", **params) reverse_iter = reversed(list(self._iter(filter_train_only=filter_train_only))) for _, name, transformer in reverse_iter: with adjust(transformer, verbose=self._verbose): - X, y = self._mem_transform( + Xt, yt = self._mem_transform( transformer=transformer, - X=X, - y=y, + X=Xt, + y=yt, method="inverse_transform", **routed_params[name].inverse_transform, ) - return variable_return(self._convert(X), self._convert(y)) + return variable_return(self._convert(Xt), self._convert(yt)) @available_if(_final_estimator_has("decision_function")) def decision_function(self, X: XConstructor, **params) -> np.ndarray: @@ -686,20 +707,22 @@ def decision_function(self, X: XConstructor, **params) -> np.ndarray: multiclass classification tasks. """ + Xt = to_df(X) + _raise_for_params(params, self, "decision_function") routed_params = process_routing(self, "decision_function", **params) for _, name, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, _ = self._mem_transform( + Xt, _ = self._mem_transform( transformer=transformer, - X=X, + X=Xt, **routed_params.get(name, {}).get("transform", {}), ) return self.steps[-1][1].decision_function( - X, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}) + Xt, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}) ) @available_if(_final_estimator_has("predict")) @@ -740,19 +763,21 @@ def predict( if X is None and fh is None: raise ValueError("X and fh cannot be both None.") + Xt = to_df(X) + routed_params = process_routing(self, "predict", **params) for _, name, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) + Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): if fh is None: raise ValueError("The fh parameter cannot be None for forecasting estimators.") - return self.steps[-1][1].predict(fh=fh, X=X) + return self.steps[-1][1].predict(fh=fh, X=Xt) else: - return self.steps[-1][1].predict(X, **routed_params[self.steps[-1][0]].predict) + return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) @available_if(_final_estimator_has("predict_interval")) def predict_interval( @@ -782,11 +807,13 @@ def predict_interval( Computed interval forecasts. """ + Xt = to_df(X) + for _, _, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, y = self._mem_transform(transformer, X) + Xt, _ = self._mem_transform(transformer, Xt) - return self.steps[-1][1].predict_interval(fh=fh, X=X, coverage=coverage) + return self.steps[-1][1].predict_interval(fh=fh, X=Xt, coverage=coverage) @available_if(_final_estimator_has("predict_log_proba")) def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray: @@ -809,14 +836,16 @@ def predict_log_proba(self, X: XConstructor, **params) -> np.ndarray: n_classes) or a list of arrays for [multioutput tasks][]. 
""" + Xt = to_df(X) + routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) + Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform) return self.steps[-1][1].predict_log_proba( - X, **routed_params[self.steps[-1][0]].predict_log_proba + Xt, **routed_params[self.steps[-1][0]].predict_log_proba ) @available_if(_final_estimator_has("predict_proba")) @@ -863,20 +892,22 @@ def predict_proba( if X is None and fh is None: raise ValueError("X and fh cannot be both None.") + Xt = to_df(X) + routed_params = process_routing(self, "predict_proba", **params) for _, name, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, _ = self._mem_transform(transformer, X, **routed_params[name].transform) + Xt, _ = self._mem_transform(transformer, Xt, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): if fh is None: raise ValueError("The fh parameter cannot be None for forecasting estimators.") - return self.steps[-1][1].predict_proba(fh=fh, X=X, marginal=marginal) + return self.steps[-1][1].predict_proba(fh=fh, X=Xt, marginal=marginal) else: return self.steps[-1][1].predict_proba( - X, **routed_params[self.steps[-1][0]].predict_proba + Xt, **routed_params[self.steps[-1][0]].predict_proba ) @available_if(_final_estimator_has("predict_quantiles")) @@ -908,11 +939,13 @@ def predict_quantiles( Computed quantile forecasts. """ + Xt = to_df(X) + for _, _, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, y = self._mem_transform(transformer, X) + Xt, _ = self._mem_transform(transformer, Xt) - return self.steps[-1][1].predict_quantiles(fh=fh, X=X, alpha=alpha) + return self.steps[-1][1].predict_quantiles(fh=fh, X=Xt, alpha=alpha) @available_if(_final_estimator_has("predict_residuals")) def predict_residuals( @@ -937,11 +970,14 @@ def predict_residuals( n_targets) for [multivariate][] tasks. """ + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + for _, _, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, y = self._mem_transform(transformer, X, y) + Xt, yt = self._mem_transform(transformer, Xt, yt) - return self.steps[-1][1].predict_residuals(y=y, X=X) + return self.steps[-1][1].predict_residuals(y=yt, X=Xt) @available_if(_final_estimator_has("predict_var")) def predict_var( @@ -972,11 +1008,13 @@ def predict_var( Computed variance forecasts. """ + Xt = to_df(X) + for _, _, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, _ = self._mem_transform(transformer, X) + Xt, _ = self._mem_transform(transformer, Xt) - return self.steps[-1][1].predict_var(fh=fh, X=X, cov=cov) + return self.steps[-1][1].predict_var(fh=fh, X=Xt, cov=cov) def set_output(self, *, transform: EngineDataOptions | None = None) -> Self: """Set output container. 
@@ -1053,6 +1091,9 @@ def score( if X is None and y is None: raise ValueError("X and y cannot be both None.") + Xt = to_df(X) + yt = to_tabular(y, index=getattr(Xt, "index", None)) + # Drop sample weights if sktime estimator if not isinstance(self._final_estimator, BaseForecaster): params["sample_weight"] = sample_weight @@ -1061,9 +1102,9 @@ def score( for _, name, transformer in self._iter(with_final=False): with adjust(transformer, verbose=self._verbose): - X, y = self._mem_transform(transformer, X, y, **routed_params[name].transform) + Xt, yt = self._mem_transform(transformer, Xt, yt, **routed_params[name].transform) if isinstance(self._final_estimator, BaseForecaster): - return self.steps[-1][1].score(y=y, X=X, fh=fh) + return self.steps[-1][1].score(y=yt, X=Xt, fh=fh) else: - return self.steps[-1][1].score(X, y, **routed_params[self.steps[-1][0]].score) + return self.steps[-1][1].score(Xt, yt, **routed_params[self.steps[-1][0]].score) diff --git a/atom/utils/types.py b/atom/utils/types.py index 798d09c06..d7871824a 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -146,6 +146,8 @@ class SparseMatrix(Protocol): """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator: ... def _bsr_container(self): ... def _coo_container(self): ... def _csc_container(self): ... @@ -154,6 +156,9 @@ def _dia_container(self): ... def _dok_container(self): ... def _lil_container(self): ... + @property + def shape(self) -> tuple[int, int]: ... + @runtime_checkable class SkScorer(Protocol): @@ -235,8 +240,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Types for X, y and fh XConstructor: TypeAlias = ( dict[str, Sequence[Any]] - | Sequence[Sequence[Any]] - | Iterable[Sequence[Any] | tuple[Hashable, Sequence[Any]] | dict[str, Sequence[Any]]] + | Sequence[Sequence[Any] | tuple[Hashable, Sequence[Any]]] | np.ndarray | SparseMatrix | pd.Series @@ -332,7 +336,11 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Allowed values for method selection PredictionMethods: TypeAlias = Literal[ - "decision_function", "predict", "predict_log_proba", "predict_proba", "score" + "decision_function", + "predict", + "predict_log_proba", + "predict_proba", + "score", ] PredictionMethodsTS: TypeAlias = Literal[ "predict", @@ -365,7 +373,14 @@ def predict(self, *args, **kwargs) -> Pandas: ... # Others XDatasets: TypeAlias = Literal[ - "dataset", "train", "test", "holdout", "X", "X_train", "X_test", "X_holdout" + "dataset", + "train", + "test", + "holdout", + "X", + "X_train", + "X_test", + "X_holdout", ] YDatasets: TypeAlias = Literal["y", "y_train", "y_test", "y_holdout"] Seasonality: TypeAlias = IntLargerOne | str | Sequence[IntLargerOne | str] | None @@ -407,9 +422,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... from pyspark.sql import DataFrame as SparkDataFrame XReturn: TypeAlias = ( - Sequence[Sequence[Any]] - | np.ndarray - | SparseMatrix + np.ndarray | pd.DataFrame | pl.DataFrame | pl.LazyFrame @@ -419,8 +432,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
| SparkDataFrame ) YReturn: TypeAlias = ( - Sequence[Any] - | np.ndarray + np.ndarray | pd.Series | pl.Series | pa.Array diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 61b4e37e9..c3adfbfc3 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -22,7 +22,7 @@ from inspect import Parameter, signature from itertools import cycle from types import GeneratorType, MappingProxyType -from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload +from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload, cast import numpy as np import pandas as pd @@ -46,8 +46,9 @@ Bool, EngineDataOptions, EngineTuple, Estimator, FeatureNamesOut, Float, IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, Scorer, Segment, Sequence, SPTuple, Transformer, - Verbose, XConstructor, YConstructor, int_t, segment_t, sequence_t, + Verbose, XConstructor, YConstructor, int_t, segment_t, sequence_t, XReturn, YReturn ) +from pandas.core.generic import NDFrame if TYPE_CHECKING: @@ -61,7 +62,7 @@ T = TypeVar("T") -T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame) +T_Pandas = TypeVar("T_Pandas", bound=NDFrame) T_Transformer = TypeVar("T_Transformer", bound=Transformer) T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -624,7 +625,8 @@ def __init__(self, model: BaseModel, n_jobs: Int): def __call__(self, study: Study, trial: FrozenTrial): """Print trial info and store in mlflow experiment.""" try: # Fails when there are no successful trials - trial_info = self.T.trials.reset_index(names="trial").loc[trial.number] + trials = self.T.trials.reset_index(names="trial") + trial_info = cast(pd.Series, trials.loc[trial.number]) # Loc returns df or series except KeyError: return @@ -1379,8 +1381,8 @@ def get_nan(dtype: Dtype) -> float | NAType: ) else: return X.replace( - to_replace={k: (missing_values or []) + default_values for k in X}, - value={k: get_nan(X[k].dtype) for k in X}, + to_replace={c: (missing_values or []) + default_values for c in X.columns}, + value={c: get_nan(d) for c, d in X.dtypes.items()}, ) @@ -1398,8 +1400,8 @@ def n_cols(obj: YConstructor | None) -> int: Number of columns. """ - if obj is not None and hasattr(obj, "shape"): - return obj.shape[1] if len(obj.shape) > 1 else 1 + if hasattr(obj, "shape"): + return obj.shape[1] if len(obj.shape) > 1 else 1 # type: ignore[union-attr] elif isinstance(obj, dict): return 2 # Dict always goes to dataframe @@ -1457,9 +1459,9 @@ def get_col_names(obj: Any) -> list[str] | None: def variable_return( - X: pd.DataFrame | None, - y: Pandas | None, -) -> Pandas | tuple[pd.DataFrame, Pandas]: + X: XReturn | None, + y: YReturn | None, +) -> XReturn | tuple[XReturn, YReturn]: """Return one or two arguments depending on which is None. This utility is used to make methods return only the provided @@ -1467,15 +1469,15 @@ def variable_return( Parameters ---------- - X: pd.DataFrame or None + X: dataframe or None Feature set. - y: pd.Series, pd.DataFrame or None + y: series, dataframe or None Target column(s). Returns ------- - pd.Series, pd.DataFrame or tuple + series, dataframe or tuple Data sets that are not None. 
""" @@ -2184,7 +2186,7 @@ def name_cols( lambda c: np.array_equal( a1=c, a2=column, - equal_nan=is_numeric_dtype(c) and np.issubdtype(column.dtype, np.number), + equal_nan=is_numeric_dtype(c) and np.issubdtype(column.dtype.name, np.number), ) ) @@ -2316,8 +2318,8 @@ def reorder_cols( def fit_one( estimator: Estimator, - X: XConstructor | None = None, - y: YConstructor | None = None, + X: pd.DataFrame | None = None, + y: Pandas | None = None, message: str | None = None, **fit_params, ) -> Estimator: @@ -2328,11 +2330,11 @@ def fit_one( estimator: Estimator Instance to fit. - X: dataframe-like or None, default=None + X: pd.DataFrame or None, default=None Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: sequence, pd.DataFrame-like or None, default=None + y: pd.Series, pd.DataFrame or None, default=None Target column(s) corresponding to `X`. message: str or None @@ -2347,30 +2349,27 @@ def fit_one( Fitted estimator. """ - Xt = to_df(X) - yt = to_tabular(y, index=getattr(Xt, "index", None)) - with _print_elapsed_time("Pipeline", message): if hasattr(estimator, "fit"): kwargs: dict[str, Pandas] = {} - inc = getattr(estimator, "_cols", getattr(Xt, "columns", [])) + inc = getattr(estimator, "_cols", getattr(X, "columns", [])) if "X" in (params := sign(estimator.fit)): - if Xt is not None and (cols := [c for c in inc if c in Xt]): - kwargs["X"] = Xt[cols] + if X is not None and (cols := [c for c in inc if c in X]): + kwargs["X"] = X[cols] # X is required but has not been provided if len(kwargs) == 0: - if yt is not None and hasattr(estimator, "_cols"): - kwargs["X"] = to_df(yt)[inc] + if y is not None and hasattr(estimator, "_cols"): + kwargs["X"] = to_df(y)[inc] elif params["X"].default != Parameter.empty: kwargs["X"] = params["X"].default # Fill X with default - elif Xt is None: + elif X is None: raise ValueError( "Exception while trying to fit transformer " f"{estimator.__class__.__name__}. Parameter " "X is required but has not been provided." ) - elif Xt.empty: + elif X.empty: raise ValueError( "Exception while trying to fit transformer " f"{estimator.__class__.__name__}. Parameter X is " @@ -2379,8 +2378,8 @@ def fit_one( "target column, e.g., atom.decompose(columns=-1)." ) - if "y" in params and yt is not None: - kwargs["y"] = yt + if "y" in params and y is not None: + kwargs["y"] = y # Keep custom attrs since some transformers reset during fit with keep_attrs(estimator): @@ -2391,8 +2390,8 @@ def fit_one( def transform_one( transformer: Transformer, - X: XConstructor | None = None, - y: YConstructor | None = None, + X: pd.DataFrame | None = None, + y: Pandas | None = None, method: Literal["transform", "inverse_transform"] = "transform", **transform_params, ) -> tuple[pd.DataFrame | None, Pandas | None]: @@ -2403,11 +2402,11 @@ def transform_one( transformer: Transformer Instance to fit. - X: dataframe-like or None, default=None + X: pd.DataFrame or None, default=None Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: sequence, pd.DataFrame-like or None, default=None + y: pd.Series, pd.DataFrame or None, default=None Target column(s) corresponding to `X`. 
method: str, default="transform" @@ -2459,61 +2458,58 @@ def prepare_df(out: XConstructor, og: pd.DataFrame) -> pd.DataFrame: else: return out_c - Xt = to_df(X) - yt = to_tabular(y, index=getattr(Xt, "index", None)) - use_y = True kwargs: dict[str, Any] = {} - inc = list(getattr(transformer, "_cols", getattr(Xt, "columns", []))) + inc = list(getattr(transformer, "_cols", getattr(X, "columns", []))) if "X" in (params := sign(getattr(transformer, method))): - if Xt is not None and (cols := [c for c in inc if c in Xt]): - kwargs["X"] = Xt[cols] + if X is not None and (cols := [c for c in inc if c in X]): + kwargs["X"] = X[cols] # X is required but has not been provided if len(kwargs) == 0: - if yt is not None and hasattr(transformer, "_cols"): - kwargs["X"] = to_df(yt)[inc] + if y is not None and hasattr(transformer, "_cols"): + kwargs["X"] = to_df(y)[inc] use_y = False elif params["X"].default != Parameter.empty: kwargs["X"] = params["X"].default # Fill X with default else: - return Xt, yt # If X is needed, skip the transformer + return X, y # If X is needed, skip the transformer if "y" in params: # We skip `y` when already added to `X` - if yt is not None and use_y: - kwargs["y"] = yt + if y is not None and use_y: + kwargs["y"] = y elif "X" not in params: - return Xt, yt # If y is None and no X in transformer, skip the transformer + return X, y # If y is None and no X in transformer, skip the transformer out: YConstructor | tuple[XConstructor, YConstructor] = getattr(transformer, method)(**kwargs, **transform_params) # Transform can return X, y or both X_new: pd.DataFrame | None y_new: Pandas | None - if isinstance(out, tuple) and Xt is not None: - X_new = prepare_df(out[0], Xt) + if isinstance(out, tuple) and X is not None: + X_new = prepare_df(out[0], X) y_new = to_tabular(out[1], index=X_new.index) - if isinstance(yt, pd.DataFrame) and isinstance(y_new, pd.DataFrame): - y_new = prepare_df(y_new, yt) - elif "X" in params and Xt is not None and any(c in Xt for c in inc): + if isinstance(y, pd.DataFrame) and isinstance(y_new, pd.DataFrame): + y_new = prepare_df(y_new, y) + elif "X" in params and X is not None and any(c in X for c in inc): # X in -> X out - X_new = prepare_df(out, Xt) # type: ignore[arg-type] - y_new = yt if yt is None else yt.set_axis(X_new.index, axis=0) + X_new = prepare_df(out, X) # type: ignore[arg-type] + y_new = y if y is None else y.set_axis(X_new.index, axis=0) elif y is not None: y_new = to_tabular(out) - X_new = Xt if Xt is None else Xt.set_index(y_new.index) - if isinstance(yt, pd.DataFrame) and isinstance(y_new, pd.DataFrame): - y_new = prepare_df(y_new, yt) + X_new = X if X is None else X.set_index(y_new.index) + if isinstance(y, pd.DataFrame) and isinstance(y_new, pd.DataFrame): + y_new = prepare_df(y_new, y) return X_new, y_new def fit_transform_one( transformer: Transformer, - X: XConstructor | None, - y: YConstructor | None, + X: pd.DataFrame | None, + y: Pandas | None, message: str | None = None, **fit_params, ) -> tuple[pd.DataFrame | None, Pandas | None, Transformer]: @@ -2526,11 +2522,11 @@ def fit_transform_one( transformer: Transformer Instance to fit. - X: dataframe-like or None + X: pd.DataFrame or None Feature set with shape=(n_samples, n_features). If None, `X` is ignored. - y: sequence, pd.DataFrame-like or None + y: pd.Series, pd.DataFrame or None Target column(s) corresponding to `X`. 
message: str or None, default=None @@ -2552,9 +2548,9 @@ def fit_transform_one( """ fit_one(transformer, X, y, message, **fit_params) - X, y = transform_one(transformer, X, y) + Xt, yt = transform_one(transformer, X, y) - return X, y, transformer + return Xt, yt, transformer # Decorators ======================================================= >> diff --git a/pyproject.toml b/pyproject.toml index ae2d50a20..866f05ed6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -193,4 +193,5 @@ ignore_missing_imports = true disable_error_code = [ "attr-defined", "abstract", # See https://github.com/python/mypy/issues/4717 + "override", # Transformers' methods don't always match with that of TransformerMixin ] diff --git a/tests/conftest.py b/tests/conftest.py index 661cd51e5..c416ad7e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import DataFrame, Pandas, Sequence, XSelector + from atom.utils.types import DataFrame, Pandas, Sequence, XSelector, XConstructor class DummyTransformer(TransformerMixin, BaseEstimator): @@ -128,7 +128,7 @@ def random(): def get_train_test( - X: XSelector | None, + X: XConstructor | None, y: Sequence[Any] | pd.DataFrame, ) -> Pandas | tuple[Pandas, Pandas]: """Get train and test sets from X and y. diff --git a/tests/test_data.py b/tests/test_data.py index 2cdf2c155..9d36db453 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -741,11 +741,12 @@ def test_pyarrow_engine(): assert isinstance(atom.y, pa.Array) +@patch.dict("sys.modules", {"modin": MagicMock(spec=["__spec__", "pandas"])}) def test_modin_engine(): """Assert that the modin engine returns modin types.""" atom = ATOMClassifier(X_bin, y_bin, engine="modin", random_state=1) - assert isinstance(atom.X, md.DataFrame) - assert isinstance(atom.y, md.Series) + assert "DataFrame" in str(atom.X) + assert "Series" in str(atom.y) def test_dask_engine(): @@ -755,7 +756,7 @@ def test_dask_engine(): assert isinstance(atom.y, dd.Series) -@patch.dict("sys.modules", {"pyspark": MagicMock(spec=["__spec__", "sql"])}) +@patch.dict("sys.modules", {"pyspark.sql": MagicMock(spec=["__spec__", "SparkSession"])}) def test_pyspark_engine(): """Assert that the pyspark engine returns pyspark types.""" atom = ATOMClassifier(X_bin, y_bin, engine="pyspark", random_state=1) From c3d9c6b184d92ea953bfaf937e5d77975341742a Mon Sep 17 00:00:00 2001 From: Mavs Date: Sun, 25 Feb 2024 20:53:26 +0100 Subject: [PATCH 11/12] fixing type hints 2 --- atom/baserunner.py | 7 +++---- atom/basetransformer.py | 5 +++-- atom/data/dataengines.py | 2 +- atom/data_cleaning.py | 8 ++++---- atom/feature_engineering.py | 10 +++++----- atom/nlp.py | 2 +- atom/pipeline.py | 7 ++++--- atom/utils/types.py | 2 +- atom/utils/utils.py | 12 +++++++----- tests/conftest.py | 4 +++- tests/test_data.py | 1 - tests/test_data_cleaning.py | 4 ++-- 12 files changed, 34 insertions(+), 30 deletions(-) diff --git a/atom/baserunner.py b/atom/baserunner.py index 39541d169..bc128d132 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -626,12 +626,11 @@ def _no_data_sets( except ValueError as ex: # Clarify common error with stratification for multioutput tasks - if "least populated class" in str(ex) and isinstance(y, pd.DataFrame): + if isinstance(y, pd.DataFrame): raise ValueError( "Stratification for multioutput tasks is applied over all target " - "columns, which results in a least populated class that has only " - "one member. 
Either select only one column to stratify over, or " - "set the parameter stratify=False." + "columns. Either select only one column to stratify over, or set " + "the parameter stratify=False." ) from ex else: raise ex diff --git a/atom/basetransformer.py b/atom/basetransformer.py index 19a005439..5e2dfee29 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -19,7 +19,7 @@ from logging import DEBUG, FileHandler, Formatter, Logger, getLogger from multiprocessing import cpu_count from pathlib import Path -from typing import Any, Literal, TypeVar, overload +from typing import Literal, TypeVar, overload import joblib import mlflow @@ -33,7 +33,8 @@ from atom.utils.types import ( Backend, Bool, Engine, EngineDataOptions, EngineEstimatorOptions, EngineTuple, Estimator, FeatureNamesOut, Int, IntLargerEqualZero, Pandas, - Severity, Verbose, Warnings, XSelector, YSelector, bool_t, int_t, YReturn, XReturn + Severity, Verbose, Warnings, XReturn, XSelector, YReturn, YSelector, + bool_t, int_t, ) from atom.utils.utils import ( check_dependency, crash, lst, make_sklearn, to_df, to_tabular, diff --git a/atom/data/dataengines.py b/atom/data/dataengines.py index 74fb19594..7d5d4500c 100644 --- a/atom/data/dataengines.py +++ b/atom/data/dataengines.py @@ -21,8 +21,8 @@ import modin.pandas as md import polars as pl import pyarrow as pa - import pyspark.sql as psql import pyspark.pandas as ps + import pyspark.sql as psql class DataEngine(metaclass=ABCMeta): diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index f778e019d..e713844e0 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -10,7 +10,7 @@ import re from collections import defaultdict from collections.abc import Hashable -from typing import Any, Literal, TypeVar, cast, overload +from typing import Any, Literal, TypeVar, overload import numpy as np import pandas as pd @@ -52,9 +52,9 @@ Bins, Bool, CategoricalStrats, DiscretizerStrats, Engine, EngineDataOptions, EngineTuple, Estimator, FloatLargerZero, Int, IntLargerEqualZero, IntLargerTwo, IntLargerZero, NJobs, NormalizerStrats, - NumericalStrats, Pandas, Predictor, PrunerStrats, Scalar, ScalerStrats, - SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, - YConstructor, sequence_t, XReturn, YReturn, + NumericalStrats, Predictor, PrunerStrats, Scalar, ScalerStrats, + SeasonalityModels, Sequence, Transformer, Verbose, XConstructor, XReturn, + YConstructor, YReturn, sequence_t, ) from atom.utils.utils import ( Goal, check_is_fitted, get_col_names, get_col_order, get_cols, it, lst, diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index c333fd86b..430d7fe71 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -9,7 +9,7 @@ from collections.abc import Hashable from random import sample -from typing import Any, Literal +from typing import Any, Literal, cast import featuretools as ft import numpy as np @@ -36,7 +36,7 @@ Bool, Engine, FeatureSelectionSolvers, FeatureSelectionStrats, FloatLargerEqualZero, FloatLargerZero, FloatZeroToOneInc, IntLargerEqualZero, IntLargerZero, NJobs, Operators, Scalar, Sequence, - Verbose, XConstructor, YConstructor, XReturn + Verbose, XConstructor, XReturn, YConstructor, ) from atom.utils.utils import ( Goal, Task, check_is_fitted, check_scaling, get_custom_scorer, is_sparse, @@ -1040,7 +1040,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): Xt = to_df(X) yt = to_tabular(y, index=Xt.index) - if yt is None and self.strategy != "pca": + if yt is None 
and self.strategy not in ("pca", "sfm", None): raise ValueError( "Invalid value for the y parameter. Value cannot " f"be None for strategy='{self.strategy}'." @@ -1248,7 +1248,7 @@ def objective_function(model, X_train, y_train, X_valid, y_valid, scoring): # PCA requires the features to be scaled if not check_scaling(Xt): self.scaler_ = Scaler(device=self.device, engine=self.engine) - Xt = self.scaler_.fit_transform(Xt) + Xt = cast(pd.DataFrame, self.scaler_.fit_transform(Xt)) estimator = self._get_est_class("PCA", "decomposition") solver_param = "svd_solver" @@ -1509,7 +1509,7 @@ def transform(self, X: XConstructor, y: YConstructor | None = None) -> XReturn: if self.scaler_: self._log(" --> Scaling features...", 2) - Xt = self.scaler_.transform(Xt) + Xt = cast(pd.DataFrame, self.scaler_.transform(Xt)) Xt = self._estimator.transform(Xt).iloc[:, :self._estimator._comps] diff --git a/atom/nlp.py b/atom/nlp.py index 751ea637c..392124eb9 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -22,7 +22,7 @@ from atom.data_cleaning import TransformerMixin from atom.utils.types import ( Bool, Engine, FloatLargerZero, Sequence, VectorizerStarts, Verbose, - XConstructor, YConstructor, bool_t, XReturn + XConstructor, XReturn, YConstructor, bool_t, ) from atom.utils.utils import ( check_is_fitted, check_nltk_module, get_corpus, is_sparse, merge, to_df, diff --git a/atom/pipeline.py b/atom/pipeline.py index f14686485..d4d57b391 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -27,11 +27,12 @@ from atom.utils.types import ( Bool, EngineDataOptions, EngineTuple, Estimator, FHConstructor, Float, - Pandas, Scalar, Sequence, Verbose, XConstructor, YConstructor, YReturn, XReturn + Pandas, Scalar, Sequence, Verbose, XConstructor, XReturn, YConstructor, + YReturn, ) from atom.utils.utils import ( - NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, - transform_one, variable_return, to_df, to_tabular + NotFittedError, adjust, check_is_fitted, fit_one, fit_transform_one, to_df, + to_tabular, transform_one, variable_return, ) diff --git a/atom/utils/types.py b/atom/utils/types.py index d7871824a..8a86a3067 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -8,7 +8,7 @@ from __future__ import annotations import os -from collections.abc import Callable, Hashable, Iterable, Iterator +from collections.abc import Callable, Hashable, Iterator from importlib.util import find_spec from typing import ( TYPE_CHECKING, Annotated, Any, Literal, NamedTuple, SupportsIndex, diff --git a/atom/utils/utils.py b/atom/utils/utils.py index c3adfbfc3..79fe21dd3 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -22,7 +22,7 @@ from inspect import Parameter, signature from itertools import cycle from types import GeneratorType, MappingProxyType -from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload, cast +from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload import numpy as np import pandas as pd @@ -32,6 +32,7 @@ from pandas._libs.missing import NAType from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype +from pandas.core.generic import NDFrame from sklearn.base import BaseEstimator from sklearn.base import OneToOneFeatureMixin as FMixin from sklearn.metrics import ( @@ -46,9 +47,9 @@ Bool, EngineDataOptions, EngineTuple, Estimator, FeatureNamesOut, Float, IndexSelector, Int, IntLargerEqualZero, MetricFunction, Model, Pandas, Predictor, Scalar, Scorer, Segment, Sequence, SPTuple, Transformer, - Verbose, XConstructor, YConstructor, 
int_t, segment_t, sequence_t, XReturn, YReturn + Verbose, XConstructor, XReturn, YConstructor, YReturn, int_t, segment_t, + sequence_t, ) -from pandas.core.generic import NDFrame if TYPE_CHECKING: @@ -2180,7 +2181,7 @@ def name_cols( # If columns were added or removed temp_cols = [] - for i, (name, column) in enumerate(df.items()): + for i, column in enumerate(get_cols(df)): # equal_nan=True fails for non-numeric dtypes mask = original_df.apply( # type: ignore[type-var] lambda c: np.array_equal( @@ -2483,7 +2484,8 @@ def prepare_df(out: XConstructor, og: pd.DataFrame) -> pd.DataFrame: elif "X" not in params: return X, y # If y is None and no X in transformer, skip the transformer - out: YConstructor | tuple[XConstructor, YConstructor] = getattr(transformer, method)(**kwargs, **transform_params) + caller = getattr(transformer, method) + out: YConstructor | tuple[XConstructor, YConstructor] = caller(**kwargs, **transform_params) # Transform can return X, y or both X_new: pd.DataFrame | None diff --git a/tests/conftest.py b/tests/conftest.py index c416ad7e3..4e58c2c77 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,9 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import DataFrame, Pandas, Sequence, XSelector, XConstructor + from atom.utils.types import ( + DataFrame, Pandas, Sequence, XConstructor, + ) class DummyTransformer(TransformerMixin, BaseEstimator): diff --git a/tests/test_data.py b/tests/test_data.py index 9d36db453..9bee37d88 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -10,7 +10,6 @@ from unittest.mock import MagicMock, patch import dask.dataframe as dd -import modin.pandas as md import numpy as np import pandas as pd import polars as pl diff --git a/tests/test_data_cleaning.py b/tests/test_data_cleaning.py index c888b4525..8c5bd88ff 100644 --- a/tests/test_data_cleaning.py +++ b/tests/test_data_cleaning.py @@ -423,8 +423,8 @@ def test_missing_values_are_propagated(): def test_unknown_classes_are_imputed(): """Assert that unknown classes are imputed.""" encoder = Encoder() - encoder.fit(["a", "b", "b", "a"]) - assert encoder.transform(["c"]).iloc[0, 0] == -1.0 + encoder.fit([["a"], ["b"], ["b"], ["a"]]) + assert encoder.transform([["c"]]).iloc[0, 0] == -1.0 def test_ordinal_encoder(): From a59a3b51c8c22db50e8bc78fafb4d075b7fa0ed0 Mon Sep 17 00:00:00 2001 From: Marco van den Boom Date: Mon, 26 Feb 2024 19:19:39 +0100 Subject: [PATCH 12/12] dataengines final --- atom/_show_versions.py | 27 ++++++--- atom/atom.py | 3 +- atom/basemodel.py | 114 ++++++++++++++++++----------------- atom/basetransformer.py | 8 +-- atom/data/branch.py | 16 ++--- atom/data_cleaning.py | 6 +- atom/utils/utils.py | 30 ++++----- docs_sources/dependencies.md | 4 +- pyproject.toml | 4 +- tests/conftest.py | 4 +- 10 files changed, 113 insertions(+), 103 deletions(-) diff --git a/atom/_show_versions.py b/atom/_show_versions.py index 22e02a07f..56dfcbdc9 100644 --- a/atom/_show_versions.py +++ b/atom/_show_versions.py @@ -20,12 +20,11 @@ "atom", "beartype", "category_encoders", - "dagshub", "dill", + "featuretools", "gplearn", "imblearn", "ipywidgets", - "featuretools", "joblib", "matplotlib", "mlflow", @@ -35,17 +34,31 @@ "optuna", "pandas", "plotly", - "polars", - "pyarrow", - "ray", - "requests", "sklearn", - "sklearnex", # Has no __version__ attribute "scipy", "shap", "sktime", "statsmodels", "zoofs", # Has no __version__ attribute + "botorch", + "catboost", + "dagshub", + "dask[distributed]", + "explainerdashboard", + "gradio", + "lightgbm", + 
"modin[ray]", + "polars", + "pyarrow", + "pyspark", + "ray[serve]", + "requests", + "sklearnex", + "schemdraw", + "statsforecast", + "sweetviz", + "wordcloud", + "xgboost", ] diff --git a/atom/atom.py b/atom/atom.py index 974838f17..8b6c600de 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -748,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM **X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features). - **y: int, str, dict, sequence or dataframe**
+ **y: int, str, sequence or dataframe**
Target column(s) corresponding to `X`. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. diff --git a/atom/basemodel.py b/atom/basemodel.py index 6fb2ed6d4..170940b93 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -15,7 +15,7 @@ from importlib import import_module from logging import Logger from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, overload +from typing import TYPE_CHECKING, Any, Literal, cast, overload from unittest.mock import patch import dill as pickle @@ -274,7 +274,8 @@ def __init__( self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts if getattr(self, "needs_scaling", None) and not self.branch.check_scaling(): - self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train) + self.scaler = Scaler(device=self.device, engine=self.engine.estimator) + self.scaler.fit(self.X_train) def __repr__(self) -> str: """Display class name.""" @@ -704,7 +705,7 @@ def _get_pred( # Statsmodels models such as SARIMAX and DF require all # exogenous data after the last row of the train set # Other models accept this format - Xe = pd.concat([self.test, self.holdout]) # type: ignore[list-item] + Xe = pd.concat([self.test, self.holdout]) exog = Xe.loc[Xe.index <= X.index.max(), self.features] # type: ignore[index] y_pred = self._prediction( @@ -1680,10 +1681,11 @@ def y(self) -> Pandas: def X_train(self) -> pd.DataFrame: """Features of the training set.""" features = self.branch.features.isin(self._config.ignore) + X_train = self.branch.X_train.iloc[-self._train_idx:, ~features] if self.scaler: - return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features]) + return cast(pd.DataFrame, self.scaler.transform(X_train)) else: - return self.branch.X_train.iloc[-self._train_idx:, ~features] + return X_train @property def y_train(self) -> Pandas: @@ -1694,10 +1696,11 @@ def y_train(self) -> Pandas: def X_test(self) -> pd.DataFrame: """Features of the test set.""" features = self.branch.features.isin(self._config.ignore) + X_test = self.branch.X_test.iloc[:, ~features] if self.scaler: - return self.scaler.transform(self.branch.X_test.iloc[:, ~features]) + return cast(pd.DataFrame, self.scaler.transform(X_test)) else: - return self.branch.X_test.iloc[:, ~features] + return X_test @property def X_holdout(self) -> pd.DataFrame | None: @@ -2195,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False): if include_holdout and self.holdout is None: raise ValueError("No holdout data set available.") - if include_holdout and self.holdout is not None: + if not include_holdout: + X, y = self.X, self.y + else: X = pd.concat([self.X, self.X_holdout]) y = pd.concat([self.y, self.y_holdout]) - else: - X, y = self.X, self.y # Assign a mlflow run to the new estimator if self.experiment: @@ -2518,17 +2521,6 @@ def get_tags(self) -> dict[str, Any]: "supports_engines": ", ".join(getattr(self, "supports_engines", [])), } - @overload - def _prediction( - self, - X: RowSelector | XSelector, - y: YSelector | None = ..., - metric: str | MetricFunction | Scorer | None = ..., - sample_weight: Sequence[Scalar] | None = ..., - verbose: Verbose | None = ..., - method: Literal["score"] = ..., - ) -> Float: ... 
- @overload def _prediction( self, @@ -2545,6 +2537,17 @@ def _prediction( ] = ..., ) -> Pandas: ... + @overload + def _prediction( + self, + X: RowSelector | XSelector, + y: YSelector | None, + metric: str | MetricFunction | Scorer | None, + sample_weight: Sequence[Scalar] | None, + verbose: Verbose | None, + method: Literal["score"], + ) -> Float: ... + def _prediction( self, X: RowSelector | XSelector, @@ -2567,13 +2570,12 @@ def _prediction( set with shape=(n_samples, n_features) to make predictions on. - y: int, str, dict, sequence, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `y` is ignored. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -2603,23 +2605,26 @@ def _prediction( """ - def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]: + def get_transform_X_y( + X: RowSelector | XSelector, + y: YSelector | None, + ) -> tuple[pd.DataFrame, Pandas | None]: """Get X and y from the pipeline transformation. Parameters ---------- - X: dataframe-like - Feature set. + X: hashable, segment, sequence or dataframe-like + Feature set. If not dataframe-like, expected to fail. - y: int, str or sequence - Target column(s). + y: int, str, sequence, dataframe-like or None + Target column(s) corresponding to `X`. Returns ------- dataframe Transformed feature set. - series or dataframe + series, dataframe or None Transformed target column. """ @@ -2889,13 +2894,12 @@ def score( set with shape=(n_samples, n_features) to make predictions on. - y: int, str, dict, sequence, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. - If None: `X` must be a selection of rows in the dataset. - If int: Position of the target column in `X`. - If str: Name of the target column in `X`. - - If dict: Name of the target column and sequence of values. - If sequence: Target column with shape=(n_samples,) or sequence of column names or positions for multioutput tasks. @@ -2965,39 +2969,39 @@ def _prediction( X: XSelector | None = ..., metric: str | MetricFunction | Scorer | None = ..., verbose: Verbose | None = ..., - method: Literal["score"] = ..., + method: Literal[ + "predict", + "predict_interval", + "predict_quantiles", + "predict_residuals", + "predict_var", + ] = ..., **kwargs, - ) -> Float: ... + ) -> Pandas: ... @overload def _prediction( self, - fh: RowSelector | FHConstructor | None = ..., - y: RowSelector | YSelector | None = ..., - X: XSelector | None = ..., - metric: str | MetricFunction | Scorer | None = ..., - verbose: Verbose | None = ..., - method: Literal["predict_proba"] = ..., + fh: RowSelector | FHConstructor | None, + y: RowSelector | YSelector | None, + X: XSelector | None, + metric: str | MetricFunction | Scorer | None, + verbose: Verbose | None, + method: Literal["predict_proba"], **kwargs, ) -> Normal: ... 
@overload def _prediction( self, - fh: RowSelector | FHConstructor | None = ..., - y: RowSelector | YSelector | None = ..., - X: XSelector | None = ..., - metric: str | MetricFunction | Scorer | None = ..., - verbose: Verbose | None = ..., - method: Literal[ - "predict", - "predict_interval", - "predict_quantiles", - "predict_residuals", - "predict_var", - ] = ..., + fh: RowSelector | FHConstructor | None, + y: RowSelector | YSelector | None, + X: XSelector | None, + metric: str | MetricFunction | Scorer | None, + verbose: Verbose | None, + method: Literal["score"], **kwargs, - ) -> Pandas: ... + ) -> Float: ... def _prediction( self, @@ -3021,7 +3025,7 @@ def _prediction( The [forecasting horizon][row-and-column-selection] encoding the time stamps to forecast at. - y: int, str, dict, sequence, dataframe-like or None, default=None + y: int, str, sequence, dataframe-like or None, default=None Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None @@ -3299,7 +3303,7 @@ def predict_residuals( Parameters ---------- - y: int, str, dict, sequence or dataframe + y: int, str, sequence or dataframe Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None @@ -3397,7 +3401,7 @@ def score( Parameters ---------- - y: int, str, dict, sequence or dataframe-like + y: int, str, sequence or dataframe-like Ground truth observations. X: hashable, segment, sequence, dataframe-like or None, default=None diff --git a/atom/basetransformer.py b/atom/basetransformer.py index 5e2dfee29..859d3f930 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -181,12 +181,12 @@ def backend(self, value: Backend): elif value == "dask": check_dependency("dask") - import dask + from dask.distributed import Client try: - dask.distributed.Client.current() + Client.current() except ValueError: - dask.distributed.Client(processes=False) + Client(processes=False) joblib.parallel_config(backend=value) @@ -369,7 +369,7 @@ def _device_id(self) -> int: @overload def _check_input( X: XSelector, - y: Literal[None] = ..., + y: Literal[None], *, columns: Axes | None = ..., name: str | Axes | None = ..., diff --git a/atom/data/branch.py b/atom/data/branch.py index fd5710ab4..d2f1f20b4 100644 --- a/atom/data/branch.py +++ b/atom/data/branch.py @@ -428,9 +428,9 @@ def shape(self) -> tuple[Int, Int]: return self.dataset.shape @property - def columns(self) -> pd.Index: + def columns(self) -> list[str]: """Name of all the columns.""" - return self.dataset.columns + return list(self.dataset.columns) @property def n_columns(self) -> int: @@ -438,9 +438,9 @@ def n_columns(self) -> int: return len(self.columns) @property - def features(self) -> pd.Index: + def features(self) -> list[str]: """Name of the features.""" - return self.columns[:-self._data.n_targets] + return list(self.columns[:-self._data.n_targets]) @property def n_features(self) -> int: @@ -460,7 +460,7 @@ def _all(self) -> pd.DataFrame: calculation. 
""" - return pd.concat([self.dataset, self.holdout]) # type: ignore[list-item] + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> @@ -580,10 +580,12 @@ def _get_rows( # If rows were excluded with `!`, select all but those inc = list(_all.index[~_all.index.isin(exc)]) + rows_c = _all.loc[inc] + if return_X_y: - return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index] + return rows_c[self.features], rows_c[self.target] else: - return self._all.loc[inc] + return rows_c def _get_columns( self, diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index e713844e0..2861c0326 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -820,10 +820,10 @@ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> S self.target_names_in_ = np.array(get_col_names(yt)) if self.drop_chars: - if isinstance(yt, pd.Series): - yt.name = re.sub(self.drop_chars, "", str(yt.name)) - else: + if isinstance(yt, pd.DataFrame): yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + else: + yt.name = re.sub(self.drop_chars, "", str(yt.name)) if self.drop_missing_target: yt = replace_missing(yt, self.missing_).dropna(axis=0) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 79fe21dd3..10354963c 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -24,15 +24,19 @@ from types import GeneratorType, MappingProxyType from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload +import mlflow +import nltk import numpy as np import pandas as pd +import plotly.graph_objects as go import scipy.sparse as sps from beartype.door import is_bearable from IPython.display import display +from matplotlib.colors import to_rgba from pandas._libs.missing import NAType from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype -from pandas.core.generic import NDFrame +from shap import Explainer from sklearn.base import BaseEstimator from sklearn.base import OneToOneFeatureMixin as FMixin from sklearn.metrics import ( @@ -55,7 +59,7 @@ if TYPE_CHECKING: from optuna.study import Study from optuna.trial import FrozenTrial - from shap import Explainer, Explanation + from shap import Explanation from atom.basemodel import BaseModel from atom.baserunner import BaseRunner @@ -63,7 +67,7 @@ T = TypeVar("T") -T_Pandas = TypeVar("T_Pandas", bound=NDFrame) +T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame) T_Transformer = TypeVar("T_Transformer", bound=Transformer) T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -633,8 +637,6 @@ def __call__(self, study: Study, trial: FrozenTrial): # Save trials to mlflow experiment as nested runs if self.T.experiment and self.T.log_ht: - import mlflow - with mlflow.start_run(run_id=self.T.run.info.run_id): run_name = f"{self.T.name} - {trial.number}" with mlflow.start_run(run_name=run_name, nested=True): @@ -734,8 +736,6 @@ class PlotCallback: max_len = 15 # Maximum trials to show at once in the plot def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics): - import plotly.graph_objects as go - self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} @@ -925,8 +925,6 @@ def explainer(self) -> Explainer: Get the initialized explainer object. 
""" - from shap import Explainer - kwargs = { "masker": self.branch.X_train, "feature_names": list(self.branch.features), @@ -1286,8 +1284,6 @@ def to_rgb(c: str) -> str: Color's RGB representation. """ - from matplotlib.colors import to_rgba - if not c.startswith("rgb"): colors = to_rgba(c)[:3] return f"rgb({colors[0]}, {colors[1]}, {colors[2]})" @@ -1375,15 +1371,15 @@ def get_nan(dtype: Dtype) -> float | NAType: # Always convert these values default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf] - if isinstance(X, pd.Series): + if isinstance(X, pd.DataFrame): return X.replace( - to_replace=(missing_values or []) + default_values, - value=get_nan(X.dtype), + to_replace={c: (missing_values or []) + default_values for c in X.columns}, + value={c: get_nan(d) for c, d in X.dtypes.items()}, ) else: return X.replace( - to_replace={c: (missing_values or []) + default_values for c in X.columns}, - value={c: get_nan(d) for c, d in X.dtypes.items()}, + to_replace=(missing_values or []) + default_values, + value=get_nan(X.dtype), ) @@ -1584,8 +1580,6 @@ def check_nltk_module(module: str, *, quiet: bool): Whether to show logs when downloading. """ - import nltk - try: nltk.data.find(module) except LookupError: diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index f464f3795..7dcaa5c61 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -34,7 +34,7 @@ packages are necessary for its correct functioning. * **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1) * **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1) * **[matplotlib](https://matplotlib.org/)** (>=3.7.2) -* **[mlflow](https://mlflow.org/)** (>=2.7.1) +* **[mlflow](https://mlflow.org/)** (>=2.10.2) * **[nltk](https://www.nltk.org/)** (>=3.8.1) * **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) @@ -57,7 +57,7 @@ additional libraries. You can install all the optional dependencies using * **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5) * **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2) * **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8) -* **[dask](https://dask.org/)** (>=2024.2.0) +* **[dask[distributed]](https://dask.org/)** (>=2024.2.0) * **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3) * **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4) * **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0) diff --git a/pyproject.toml b/pyproject.toml index 866f05ed6..981280631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "ipywidgets>=8.1.1", "joblib>=1.3.1", "matplotlib>=3.7.2", - "mlflow>=2.7.1", + "mlflow>=2.10.2", "nltk>=3.8.1", "numpy>=1.23.0", "optuna>=3.4.0", @@ -48,7 +48,7 @@ full = [ "botorch>=0.8.5", "catboost>=1.2", "dagshub>=0.3.8", - "dask>=2024.2.0", + "dask[distributed]>=2024.2.0", "explainerdashboard>=0.4.3", "gradio>=3.44.4", "lightgbm>=4.1.0", diff --git a/tests/conftest.py b/tests/conftest.py index 4e58c2c77..97c7858ba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,9 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import ( - DataFrame, Pandas, Sequence, XConstructor, - ) + from atom.utils.types import DataFrame, Pandas, Sequence, XConstructor class DummyTransformer(TransformerMixin, BaseEstimator):