From a59a3b51c8c22db50e8bc78fafb4d075b7fa0ed0 Mon Sep 17 00:00:00 2001
From: Marco van den Boom
Date: Mon, 26 Feb 2024 19:19:39 +0100
Subject: [PATCH] dataengines final

---
 atom/_show_versions.py       |  27 ++++++---
 atom/atom.py                 |   3 +-
 atom/basemodel.py            | 114 ++++++++++++++++++-----------------
 atom/basetransformer.py      |   8 +--
 atom/data/branch.py          |  16 ++---
 atom/data_cleaning.py        |   6 +-
 atom/utils/utils.py          |  30 ++++-----
 docs_sources/dependencies.md |   4 +-
 pyproject.toml               |   4 +-
 tests/conftest.py            |   4 +-
 10 files changed, 113 insertions(+), 103 deletions(-)

diff --git a/atom/_show_versions.py b/atom/_show_versions.py
index 22e02a07f..56dfcbdc9 100644
--- a/atom/_show_versions.py
+++ b/atom/_show_versions.py
@@ -20,12 +20,11 @@
     "atom",
     "beartype",
     "category_encoders",
-    "dagshub",
     "dill",
+    "featuretools",
     "gplearn",
     "imblearn",
     "ipywidgets",
-    "featuretools",
     "joblib",
     "matplotlib",
     "mlflow",
@@ -35,17 +34,31 @@
     "optuna",
     "pandas",
     "plotly",
-    "polars",
-    "pyarrow",
-    "ray",
-    "requests",
     "sklearn",
-    "sklearnex",  # Has no __version__ attribute
     "scipy",
     "shap",
     "sktime",
     "statsmodels",
     "zoofs",  # Has no __version__ attribute
+    "botorch",
+    "catboost",
+    "dagshub",
+    "dask[distributed]",
+    "explainerdashboard",
+    "gradio",
+    "lightgbm",
+    "modin[ray]",
+    "polars",
+    "pyarrow",
+    "pyspark",
+    "ray[serve]",
+    "requests",
+    "sklearnex",
+    "schemdraw",
+    "statsforecast",
+    "sweetviz",
+    "wordcloud",
+    "xgboost",
 ]
diff --git a/atom/atom.py b/atom/atom.py
index 974838f17..8b6c600de 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -748,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
         **X, train, test: dataframe-like**<br>
            Feature set with shape=(n_samples, n_features).
 
-        **y: int, str, dict, sequence or dataframe**<br>
+        **y: int, str, sequence or dataframe**<br>
            Target column(s) corresponding to `X`.
 
            - If int: Position of the target column in `X`.
            - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
            - If sequence: Target column with shape=(n_samples,) or
              sequence of column names or positions for multioutput
              tasks.
diff --git a/atom/basemodel.py b/atom/basemodel.py
index 6fb2ed6d4..170940b93 100644
--- a/atom/basemodel.py
+++ b/atom/basemodel.py
@@ -15,7 +15,7 @@
 from importlib import import_module
 from logging import Logger
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
 from unittest.mock import patch
 
 import dill as pickle
@@ -274,7 +274,8 @@ def __init__(
         self._train_idx = len(self.branch._data.train_idx)  # Can change for sh and ts
 
         if getattr(self, "needs_scaling", None) and not self.branch.check_scaling():
-            self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train)
+            self.scaler = Scaler(device=self.device, engine=self.engine.estimator)
+            self.scaler.fit(self.X_train)
 
     def __repr__(self) -> str:
         """Display class name."""
@@ -704,7 +705,7 @@ def _get_pred(
             # Statsmodels models such as SARIMAX and DF require all
             # exogenous data after the last row of the train set
             # Other models accept this format
-            Xe = pd.concat([self.test, self.holdout])  # type: ignore[list-item]
+            Xe = pd.concat([self.test, self.holdout])
             exog = Xe.loc[Xe.index <= X.index.max(), self.features]  # type: ignore[index]
 
             y_pred = self._prediction(
@@ -1680,10 +1681,11 @@ def y(self) -> Pandas:
     def X_train(self) -> pd.DataFrame:
         """Features of the training set."""
         features = self.branch.features.isin(self._config.ignore)
+        X_train = self.branch.X_train.iloc[-self._train_idx:, ~features]
         if self.scaler:
-            return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features])
+            return cast(pd.DataFrame, self.scaler.transform(X_train))
         else:
-            return self.branch.X_train.iloc[-self._train_idx:, ~features]
+            return X_train
 
     @property
     def y_train(self) -> Pandas:
@@ -1694,10 +1696,11 @@ def X_test(self) -> pd.DataFrame:
     def X_test(self) -> pd.DataFrame:
         """Features of the test set."""
         features = self.branch.features.isin(self._config.ignore)
+        X_test = self.branch.X_test.iloc[:, ~features]
         if self.scaler:
-            return self.scaler.transform(self.branch.X_test.iloc[:, ~features])
+            return cast(pd.DataFrame, self.scaler.transform(X_test))
         else:
-            return self.branch.X_test.iloc[:, ~features]
+            return X_test
 
     @property
     def X_holdout(self) -> pd.DataFrame | None:
@@ -2195,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False):
         if include_holdout and self.holdout is None:
             raise ValueError("No holdout data set available.")
 
-        if include_holdout and self.holdout is not None:
+        if not include_holdout:
+            X, y = self.X, self.y
+        else:
             X = pd.concat([self.X, self.X_holdout])
             y = pd.concat([self.y, self.y_holdout])
-        else:
-            X, y = self.X, self.y
 
         # Assign a mlflow run to the new estimator
         if self.experiment:
@@ -2518,17 +2521,6 @@ def get_tags(self) -> dict[str, Any]:
             "supports_engines": ", ".join(getattr(self, "supports_engines", [])),
         }
 
-    @overload
-    def _prediction(
-        self,
-        X: RowSelector | XSelector,
-        y: YSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        sample_weight: Sequence[Scalar] | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal["score"] = ...,
-    ) -> Float: ...
-
     @overload
     def _prediction(
         self,
         X: RowSelector | XSelector,
         y: YSelector | None = ...,
         metric: str | MetricFunction | Scorer | None = ...,
         sample_weight: Sequence[Scalar] | None = ...,
         verbose: Verbose | None = ...,
         method: Literal[
             "decision_function",
             "predict",
             "predict_log_proba",
             "predict_proba",
         ] = ...,
     ) -> Pandas: ...
 
+    @overload
+    def _prediction(
+        self,
+        X: RowSelector | XSelector,
+        y: YSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        sample_weight: Sequence[Scalar] | None,
+        verbose: Verbose | None,
+        method: Literal["score"],
+    ) -> Float: ...
+
     def _prediction(
         self,
         X: RowSelector | XSelector,
@@ -2567,13 +2570,12 @@ def _prediction(
             set with shape=(n_samples, n_features) to make
             predictions on.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Target column(s) corresponding to `X`.
 
             - If None: `y` is ignored.
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.
@@ -2603,23 +2605,26 @@ def _prediction(
 
         """
 
-        def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]:
+        def get_transform_X_y(
+            X: RowSelector | XSelector,
+            y: YSelector | None,
+        ) -> tuple[pd.DataFrame, Pandas | None]:
             """Get X and y from the pipeline transformation.
 
             Parameters
             ----------
-            X: dataframe-like
-                Feature set.
+            X: hashable, segment, sequence or dataframe-like
+                Feature set. If not dataframe-like, expected to fail.
 
-            y: int, str or sequence
-                Target column(s).
+            y: int, str, sequence, dataframe-like or None
+                Target column(s) corresponding to `X`.
 
             Returns
             -------
             dataframe
                 Transformed feature set.
 
-            series or dataframe
+            series, dataframe or None
                 Transformed target column.
 
             """
@@ -2889,13 +2894,12 @@ def score(
             set with shape=(n_samples, n_features) to make
             predictions on.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Target column(s) corresponding to `X`.
 
             - If None: `X` must be a selection of rows in the dataset.
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.
@@ -2965,39 +2969,39 @@ def _prediction(
         X: XSelector | None = ...,
         metric: str | MetricFunction | Scorer | None = ...,
         verbose: Verbose | None = ...,
-        method: Literal["score"] = ...,
+        method: Literal[
+            "predict",
+            "predict_interval",
+            "predict_quantiles",
+            "predict_residuals",
+            "predict_var",
+        ] = ...,
         **kwargs,
-    ) -> Float: ...
+    ) -> Pandas: ...
 
     @overload
     def _prediction(
         self,
-        fh: RowSelector | FHConstructor | None = ...,
-        y: RowSelector | YSelector | None = ...,
-        X: XSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal["predict_proba"] = ...,
+        fh: RowSelector | FHConstructor | None,
+        y: RowSelector | YSelector | None,
+        X: XSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        verbose: Verbose | None,
+        method: Literal["predict_proba"],
         **kwargs,
     ) -> Normal: ...
 
     @overload
     def _prediction(
         self,
-        fh: RowSelector | FHConstructor | None = ...,
-        y: RowSelector | YSelector | None = ...,
-        X: XSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal[
-            "predict",
-            "predict_interval",
-            "predict_quantiles",
-            "predict_residuals",
-            "predict_var",
-        ] = ...,
+        fh: RowSelector | FHConstructor | None,
+        y: RowSelector | YSelector | None,
+        X: XSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        verbose: Verbose | None,
+        method: Literal["score"],
         **kwargs,
-    ) -> Pandas: ...
+    ) -> Float: ...
 
     def _prediction(
         self,
@@ -3021,7 +3025,7 @@ def _prediction(
             The [forecasting horizon][row-and-column-selection] encoding
             the time stamps to forecast at.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3299,7 +3303,7 @@ def predict_residuals(
 
         Parameters
         ----------
-        y: int, str, dict, sequence or dataframe
+        y: int, str, sequence or dataframe
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3397,7 +3401,7 @@ def score(
 
         Parameters
        ----------
-        y: int, str, dict, sequence or dataframe-like
+        y: int, str, sequence or dataframe-like
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
diff --git a/atom/basetransformer.py b/atom/basetransformer.py
index 5e2dfee29..859d3f930 100644
--- a/atom/basetransformer.py
+++ b/atom/basetransformer.py
@@ -181,12 +181,12 @@ def backend(self, value: Backend):
         elif value == "dask":
             check_dependency("dask")
-            import dask
+            from dask.distributed import Client
 
             try:
-                dask.distributed.Client.current()
+                Client.current()
             except ValueError:
-                dask.distributed.Client(processes=False)
+                Client(processes=False)
 
         joblib.parallel_config(backend=value)
 
@@ -369,7 +369,7 @@ def _device_id(self) -> int:
     @overload
     def _check_input(
         X: XSelector,
-        y: Literal[None] = ...,
+        y: Literal[None],
         *,
         columns: Axes | None = ...,
         name: str | Axes | None = ...,
diff --git a/atom/data/branch.py b/atom/data/branch.py
index fd5710ab4..d2f1f20b4 100644
--- a/atom/data/branch.py
+++ b/atom/data/branch.py
@@ -428,9 +428,9 @@ def shape(self) -> tuple[Int, Int]:
         return self.dataset.shape
 
     @property
-    def columns(self) -> pd.Index:
+    def columns(self) -> list[str]:
         """Name of all the columns."""
-        return self.dataset.columns
+        return list(self.dataset.columns)
 
     @property
     def n_columns(self) -> int:
@@ -438,9 +438,9 @@ def n_columns(self) -> int:
         return len(self.columns)
 
     @property
-    def features(self) -> pd.Index:
+    def features(self) -> list[str]:
         """Name of the features."""
-        return self.columns[:-self._data.n_targets]
+        return list(self.columns[:-self._data.n_targets])
 
     @property
     def n_features(self) -> int:
@@ -460,7 +460,7 @@ def _all(self) -> pd.DataFrame:
         calculation.
""" - return pd.concat([self.dataset, self.holdout]) # type: ignore[list-item] + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> @@ -580,10 +580,12 @@ def _get_rows( # If rows were excluded with `!`, select all but those inc = list(_all.index[~_all.index.isin(exc)]) + rows_c = _all.loc[inc] + if return_X_y: - return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index] + return rows_c[self.features], rows_c[self.target] else: - return self._all.loc[inc] + return rows_c def _get_columns( self, diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index e713844e0..2861c0326 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -820,10 +820,10 @@ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> S self.target_names_in_ = np.array(get_col_names(yt)) if self.drop_chars: - if isinstance(yt, pd.Series): - yt.name = re.sub(self.drop_chars, "", str(yt.name)) - else: + if isinstance(yt, pd.DataFrame): yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + else: + yt.name = re.sub(self.drop_chars, "", str(yt.name)) if self.drop_missing_target: yt = replace_missing(yt, self.missing_).dropna(axis=0) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 79fe21dd3..10354963c 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -24,15 +24,19 @@ from types import GeneratorType, MappingProxyType from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload +import mlflow +import nltk import numpy as np import pandas as pd +import plotly.graph_objects as go import scipy.sparse as sps from beartype.door import is_bearable from IPython.display import display +from matplotlib.colors import to_rgba from pandas._libs.missing import NAType from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype -from pandas.core.generic import NDFrame +from shap import Explainer from sklearn.base import BaseEstimator from sklearn.base import OneToOneFeatureMixin as FMixin from sklearn.metrics import ( @@ -55,7 +59,7 @@ if TYPE_CHECKING: from optuna.study import Study from optuna.trial import FrozenTrial - from shap import Explainer, Explanation + from shap import Explanation from atom.basemodel import BaseModel from atom.baserunner import BaseRunner @@ -63,7 +67,7 @@ T = TypeVar("T") -T_Pandas = TypeVar("T_Pandas", bound=NDFrame) +T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame) T_Transformer = TypeVar("T_Transformer", bound=Transformer) T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -633,8 +637,6 @@ def __call__(self, study: Study, trial: FrozenTrial): # Save trials to mlflow experiment as nested runs if self.T.experiment and self.T.log_ht: - import mlflow - with mlflow.start_run(run_id=self.T.run.info.run_id): run_name = f"{self.T.name} - {trial.number}" with mlflow.start_run(run_name=run_name, nested=True): @@ -734,8 +736,6 @@ class PlotCallback: max_len = 15 # Maximum trials to show at once in the plot def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics): - import plotly.graph_objects as go - self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} @@ -925,8 +925,6 @@ def explainer(self) -> Explainer: Get the initialized explainer object. 
""" - from shap import Explainer - kwargs = { "masker": self.branch.X_train, "feature_names": list(self.branch.features), @@ -1286,8 +1284,6 @@ def to_rgb(c: str) -> str: Color's RGB representation. """ - from matplotlib.colors import to_rgba - if not c.startswith("rgb"): colors = to_rgba(c)[:3] return f"rgb({colors[0]}, {colors[1]}, {colors[2]})" @@ -1375,15 +1371,15 @@ def get_nan(dtype: Dtype) -> float | NAType: # Always convert these values default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf] - if isinstance(X, pd.Series): + if isinstance(X, pd.DataFrame): return X.replace( - to_replace=(missing_values or []) + default_values, - value=get_nan(X.dtype), + to_replace={c: (missing_values or []) + default_values for c in X.columns}, + value={c: get_nan(d) for c, d in X.dtypes.items()}, ) else: return X.replace( - to_replace={c: (missing_values or []) + default_values for c in X.columns}, - value={c: get_nan(d) for c, d in X.dtypes.items()}, + to_replace=(missing_values or []) + default_values, + value=get_nan(X.dtype), ) @@ -1584,8 +1580,6 @@ def check_nltk_module(module: str, *, quiet: bool): Whether to show logs when downloading. """ - import nltk - try: nltk.data.find(module) except LookupError: diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index f464f3795..7dcaa5c61 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -34,7 +34,7 @@ packages are necessary for its correct functioning. * **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1) * **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1) * **[matplotlib](https://matplotlib.org/)** (>=3.7.2) -* **[mlflow](https://mlflow.org/)** (>=2.7.1) +* **[mlflow](https://mlflow.org/)** (>=2.10.2) * **[nltk](https://www.nltk.org/)** (>=3.8.1) * **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) @@ -57,7 +57,7 @@ additional libraries. You can install all the optional dependencies using * **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5) * **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2) * **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8) -* **[dask](https://dask.org/)** (>=2024.2.0) +* **[dask[distributed]](https://dask.org/)** (>=2024.2.0) * **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3) * **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4) * **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0) diff --git a/pyproject.toml b/pyproject.toml index 866f05ed6..981280631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "ipywidgets>=8.1.1", "joblib>=1.3.1", "matplotlib>=3.7.2", - "mlflow>=2.7.1", + "mlflow>=2.10.2", "nltk>=3.8.1", "numpy>=1.23.0", "optuna>=3.4.0", @@ -48,7 +48,7 @@ full = [ "botorch>=0.8.5", "catboost>=1.2", "dagshub>=0.3.8", - "dask>=2024.2.0", + "dask[distributed]>=2024.2.0", "explainerdashboard>=0.4.3", "gradio>=3.44.4", "lightgbm>=4.1.0", diff --git a/tests/conftest.py b/tests/conftest.py index 4e58c2c77..97c7858ba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,9 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import ( - DataFrame, Pandas, Sequence, XConstructor, - ) + from atom.utils.types import DataFrame, Pandas, Sequence, XConstructor class DummyTransformer(TransformerMixin, BaseEstimator):