dataengines final

tvdboom · Feb 26, 2024 · a59a3b5
1 parent c3d9c6b
commit a59a3b5
Show file tree

Hide file tree

Showing 10 changed files with 113 additions and 103 deletions.
diff --git a/atom/_show_versions.py b/atom/_show_versions.py
@@ -20,12 +20,11 @@
     "atom",
     "beartype",
     "category_encoders",
-    "dagshub",
     "dill",
+    "featuretools",
     "gplearn",
     "imblearn",
     "ipywidgets",
-    "featuretools",
     "joblib",
     "matplotlib",
     "mlflow",
@@ -35,17 +34,31 @@
     "optuna",
     "pandas",
     "plotly",
-    "polars",
-    "pyarrow",
-    "ray",
-    "requests",
     "sklearn",
-    "sklearnex",  # Has no __version__ attribute
     "scipy",
     "shap",
     "sktime",
     "statsmodels",
     "zoofs",  # Has no __version__ attribute
+    "botorch",
+    "catboost",
+    "dagshub",
+    "dask[distributed]",
+    "explainerdashboard",
+    "gradio",
+    "lightgbm",
+    "modin[ray]",
+    "polars",
+    "pyarrow",
+    "pyspark",
+    "ray[serve]",
+    "requests",
+    "sklearnex",
+    "schemdraw",
+    "statsforecast",
+    "sweetviz",
+    "wordcloud",
+    "xgboost",
 ]
 
 

diff --git a/atom/atom.py b/atom/atom.py
@@ -748,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
             **X, train, test: dataframe-like**<br>
             Feature set with shape=(n_samples, n_features).
 
-            **y: int, str, dict, sequence or dataframe**<br>
+            **y: int, str, sequence or dataframe**<br>
             Target column(s) corresponding to `X`.
 
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.

diff --git a/atom/basemodel.py b/atom/basemodel.py
@@ -15,7 +15,7 @@
 from importlib import import_module
 from logging import Logger
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
 from unittest.mock import patch
 
 import dill as pickle
@@ -274,7 +274,8 @@ def __init__(
             self._train_idx = len(self.branch._data.train_idx)  # Can change for sh and ts
 
             if getattr(self, "needs_scaling", None) and not self.branch.check_scaling():
-                self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train)
+                self.scaler = Scaler(device=self.device, engine=self.engine.estimator)
+                self.scaler.fit(self.X_train)
 
     def __repr__(self) -> str:
         """Display class name."""
@@ -704,7 +705,7 @@ def _get_pred(
                     # Statsmodels models such as SARIMAX and DF require all
                     # exogenous data after the last row of the train set
                     # Other models accept this format
-                    Xe = pd.concat([self.test, self.holdout])  # type: ignore[list-item]
+                    Xe = pd.concat([self.test, self.holdout])
                     exog = Xe.loc[Xe.index <= X.index.max(), self.features]  # type: ignore[index]
 
                 y_pred = self._prediction(
@@ -1680,10 +1681,11 @@ def y(self) -> Pandas:
     def X_train(self) -> pd.DataFrame:
         """Features of the training set."""
         features = self.branch.features.isin(self._config.ignore)
+        X_train = self.branch.X_train.iloc[-self._train_idx:, ~features]
         if self.scaler:
-            return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features])
+            return cast(pd.DataFrame, self.scaler.transform(X_train))
         else:
-            return self.branch.X_train.iloc[-self._train_idx:, ~features]
+            return X_train
 
     @property
     def y_train(self) -> Pandas:
@@ -1694,10 +1696,11 @@ def y_train(self) -> Pandas:
     def X_test(self) -> pd.DataFrame:
         """Features of the test set."""
         features = self.branch.features.isin(self._config.ignore)
+        X_test = self.branch.X_test.iloc[:, ~features]
         if self.scaler:
-            return self.scaler.transform(self.branch.X_test.iloc[:, ~features])
+            return cast(pd.DataFrame, self.scaler.transform(X_test))
         else:
-            return self.branch.X_test.iloc[:, ~features]
+            return X_test
 
     @property
     def X_holdout(self) -> pd.DataFrame | None:
@@ -2195,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False):
         if include_holdout and self.holdout is None:
             raise ValueError("No holdout data set available.")
 
-        if include_holdout and self.holdout is not None:
+        if not include_holdout:
+            X, y = self.X, self.y
+        else:
             X = pd.concat([self.X, self.X_holdout])
             y = pd.concat([self.y, self.y_holdout])
-        else:
-            X, y = self.X, self.y
 
         # Assign a mlflow run to the new estimator
         if self.experiment:
@@ -2518,17 +2521,6 @@ def get_tags(self) -> dict[str, Any]:
             "supports_engines": ", ".join(getattr(self, "supports_engines", [])),
         }
 
-    @overload
-    def _prediction(
-        self,
-        X: RowSelector | XSelector,
-        y: YSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        sample_weight: Sequence[Scalar] | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal["score"] = ...,
-    ) -> Float: ...
-
     @overload
     def _prediction(
         self,
@@ -2545,6 +2537,17 @@ def _prediction(
         ] = ...,
     ) -> Pandas: ...
 
+    @overload
+    def _prediction(
+        self,
+        X: RowSelector | XSelector,
+        y: YSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        sample_weight: Sequence[Scalar] | None,
+        verbose: Verbose | None,
+        method: Literal["score"],
+    ) -> Float: ...
+
     def _prediction(
         self,
         X: RowSelector | XSelector,
@@ -2567,13 +2570,12 @@ def _prediction(
             set with shape=(n_samples, n_features) to make predictions
             on.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Target column(s) corresponding to `X`.
 
             - If None: `y` is ignored.
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.
@@ -2603,23 +2605,26 @@ def _prediction(
 
         """
 
-        def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]:
+        def get_transform_X_y(
+            X: RowSelector | XSelector,
+            y: YSelector | None,
+        ) -> tuple[pd.DataFrame, Pandas | None]:
             """Get X and y from the pipeline transformation.
 
             Parameters
             ----------
-            X: dataframe-like
-                Feature set.
+            X: hashable, segment, sequence or dataframe-like
+                Feature set. If not dataframe-like, expected to fail.
 
-            y: int, str or sequence
-                Target column(s).
+            y: int, str, sequence, dataframe-like or None
+                Target column(s) corresponding to `X`.
 
             Returns
             -------
             dataframe
                 Transformed feature set.
 
-            series or dataframe
+            series, dataframe or None
                 Transformed target column.
 
             """
@@ -2889,13 +2894,12 @@ def score(
             set with shape=(n_samples, n_features) to make predictions
             on.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Target column(s) corresponding to `X`.
 
             - If None: `X` must be a selection of rows in the dataset.
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.
@@ -2965,39 +2969,39 @@ def _prediction(
         X: XSelector | None = ...,
         metric: str | MetricFunction | Scorer | None = ...,
         verbose: Verbose | None = ...,
-        method: Literal["score"] = ...,
+        method: Literal[
+            "predict",
+            "predict_interval",
+            "predict_quantiles",
+            "predict_residuals",
+            "predict_var",
+        ] = ...,
         **kwargs,
-    ) -> Float: ...
+    ) -> Pandas: ...
 
     @overload
     def _prediction(
         self,
-        fh: RowSelector | FHConstructor | None = ...,
-        y: RowSelector | YSelector | None = ...,
-        X: XSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal["predict_proba"] = ...,
+        fh: RowSelector | FHConstructor | None,
+        y: RowSelector | YSelector | None,
+        X: XSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        verbose: Verbose | None,
+        method: Literal["predict_proba"],
         **kwargs,
     ) -> Normal: ...
 
     @overload
     def _prediction(
         self,
-        fh: RowSelector | FHConstructor | None = ...,
-        y: RowSelector | YSelector | None = ...,
-        X: XSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal[
-            "predict",
-            "predict_interval",
-            "predict_quantiles",
-            "predict_residuals",
-            "predict_var",
-        ] = ...,
+        fh: RowSelector | FHConstructor | None,
+        y: RowSelector | YSelector | None,
+        X: XSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        verbose: Verbose | None,
+        method: Literal["score"],
         **kwargs,
-    ) -> Pandas: ...
+    ) -> Float: ...
 
     def _prediction(
         self,
@@ -3021,7 +3025,7 @@ def _prediction(
             The [forecasting horizon][row-and-column-selection] encoding
             the time stamps to forecast at.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3299,7 +3303,7 @@ def predict_residuals(
 
         Parameters
         ----------
-        y: int, str, dict, sequence or dataframe
+        y: int, str, sequence or dataframe
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3397,7 +3401,7 @@ def score(
 
         Parameters
         ----------
-        y: int, str, dict, sequence or dataframe-like
+        y: int, str, sequence or dataframe-like
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None

diff --git a/atom/basetransformer.py b/atom/basetransformer.py
@@ -181,12 +181,12 @@ def backend(self, value: Backend):
 
         elif value == "dask":
             check_dependency("dask")
-            import dask
+            from dask.distributed import Client
 
             try:
-                dask.distributed.Client.current()
+                Client.current()
             except ValueError:
-                dask.distributed.Client(processes=False)
+                Client(processes=False)
 
         joblib.parallel_config(backend=value)
 
@@ -369,7 +369,7 @@ def _device_id(self) -> int:
     @overload
     def _check_input(
         X: XSelector,
-        y: Literal[None] = ...,
+        y: Literal[None],
         *,
         columns: Axes | None = ...,
         name: str | Axes | None = ...,

diff --git a/atom/data/branch.py b/atom/data/branch.py
@@ -428,19 +428,19 @@ def shape(self) -> tuple[Int, Int]:
         return self.dataset.shape
 
     @property
-    def columns(self) -> pd.Index:
+    def columns(self) -> list[str]:
         """Name of all the columns."""
-        return self.dataset.columns
+        return list(self.dataset.columns)
 
     @property
     def n_columns(self) -> int:
         """Number of columns."""
         return len(self.columns)
 
     @property
-    def features(self) -> pd.Index:
+    def features(self) -> list[str]:
         """Name of the features."""
-        return self.columns[:-self._data.n_targets]
+        return list(self.columns[:-self._data.n_targets])
 
     @property
     def n_features(self) -> int:
@@ -460,7 +460,7 @@ def _all(self) -> pd.DataFrame:
         calculation.
 
         """
-        return pd.concat([self.dataset, self.holdout])  # type: ignore[list-item]
+        return pd.concat([self.dataset, self.holdout])
 
     # Utility methods ============================================== >>
 
@@ -580,10 +580,12 @@ def _get_rows(
             # If rows were excluded with `!`, select all but those
             inc = list(_all.index[~_all.index.isin(exc)])
 
+        rows_c = _all.loc[inc]
+
         if return_X_y:
-            return _all.loc[inc, self.features], _all.loc[inc, self.target]  # type: ignore[index]
+            return rows_c[self.features], rows_c[self.target]
         else:
-            return self._all.loc[inc]
+            return rows_c
 
     def _get_columns(
         self,