diff --git a/atom/_show_versions.py b/atom/_show_versions.py
index 22e02a07f..56dfcbdc9 100644
--- a/atom/_show_versions.py
+++ b/atom/_show_versions.py
@@ -20,12 +20,11 @@
"atom",
"beartype",
"category_encoders",
- "dagshub",
"dill",
+ "featuretools",
"gplearn",
"imblearn",
"ipywidgets",
- "featuretools",
"joblib",
"matplotlib",
"mlflow",
@@ -35,17 +34,31 @@
"optuna",
"pandas",
"plotly",
- "polars",
- "pyarrow",
- "ray",
- "requests",
"sklearn",
- "sklearnex", # Has no __version__ attribute
"scipy",
"shap",
"sktime",
"statsmodels",
"zoofs", # Has no __version__ attribute
+ "botorch",
+ "catboost",
+ "dagshub",
+ "dask[distributed]",
+ "explainerdashboard",
+ "gradio",
+ "lightgbm",
+ "modin[ray]",
+ "polars",
+ "pyarrow",
+ "pyspark",
+ "ray[serve]",
+ "requests",
+ "sklearnex",
+ "schemdraw",
+ "statsforecast",
+ "sweetviz",
+ "wordcloud",
+ "xgboost",
]
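
The reshuffled list mixes bare names with extras-qualified entries like "dask[distributed]" and "modin[ray]", and keeps the note that zoofs exposes no `__version__` attribute. A minimal sketch (illustrative names, not atom's actual implementation) of resolving such a list to installed versions:

```python
# Illustrative sketch only: resolve a dependency list like the one above
# to installed versions. Extras such as "[distributed]" must be stripped
# before lookup, and importlib.metadata sidesteps packages that lack a
# __version__ attribute.
import re
from importlib.metadata import PackageNotFoundError, version

DEPS = ["pandas", "dask[distributed]", "zoofs"]  # stand-in for the full list

def resolve_versions(deps: list[str]) -> dict[str, str]:
    versions = {}
    for dep in deps:
        name = re.sub(r"\[.*?\]", "", dep)  # "modin[ray]" -> "modin"
        try:
            versions[name] = version(name)
        except PackageNotFoundError:
            versions[name] = "not installed"
    return versions
```

Note that import names such as "sklearn" differ from their distribution names, which is one reason the real module imports each package instead of querying metadata.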
diff --git a/atom/atom.py b/atom/atom.py
index 974838f17..8b6c600de 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -748,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
**X, train, test: dataframe-like**
Feature set with shape=(n_samples, n_features).
- **y: int, str, dict, sequence or dataframe**
+ **y: int, str, sequence or dataframe**
Target column(s) corresponding to `X`.
- If int: Position of the target column in `X`.
- If str: Name of the target column in `X`.
- - If dict: Name of the target column and sequence of values.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
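
With the dict option gone, `y` accepts three forms here. An illustrative call pattern with toy data (enough rows for the default train-test split):

```python
# Illustrative only: the `y` forms that remain after dropping dict support.
import pandas as pd
from atom import ATOMClassifier

X = pd.DataFrame({"feat": range(20), "target": [0, 1] * 10})

ATOMClassifier(X, y="target")  # str: name of the target column in X
ATOMClassifier(X, y=-1)        # int: position of the target column in X
ATOMClassifier(X.drop(columns="target"), y=[0, 1] * 10)  # sequence of values
```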
diff --git a/atom/basemodel.py b/atom/basemodel.py
index 6fb2ed6d4..170940b93 100644
--- a/atom/basemodel.py
+++ b/atom/basemodel.py
@@ -15,7 +15,7 @@
from importlib import import_module
from logging import Logger
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
from unittest.mock import patch
import dill as pickle
@@ -274,7 +274,8 @@ def __init__(
self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts
if getattr(self, "needs_scaling", None) and not self.branch.check_scaling():
- self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train)
+ self.scaler = Scaler(device=self.device, engine=self.engine.estimator)
+ self.scaler.fit(self.X_train)
def __repr__(self) -> str:
"""Display class name."""
@@ -704,7 +705,7 @@ def _get_pred(
# Statsmodels models such as SARIMAX and DF require all
# exogenous data after the last row of the train set.
# Other models accept this format.
- Xe = pd.concat([self.test, self.holdout]) # type: ignore[list-item]
+ Xe = pd.concat([self.test, self.holdout])
exog = Xe.loc[Xe.index <= X.index.max(), self.features] # type: ignore[index]
y_pred = self._prediction(
@@ -1680,10 +1681,11 @@ def y(self) -> Pandas:
def X_train(self) -> pd.DataFrame:
"""Features of the training set."""
features = self.branch.features.isin(self._config.ignore)
+ X_train = self.branch.X_train.iloc[-self._train_idx:, ~features]
if self.scaler:
- return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features])
+ return cast(pd.DataFrame, self.scaler.transform(X_train))
else:
- return self.branch.X_train.iloc[-self._train_idx:, ~features]
+ return X_train
@property
def y_train(self) -> Pandas:
@@ -1694,10 +1696,11 @@ def y_train(self) -> Pandas:
def X_test(self) -> pd.DataFrame:
"""Features of the test set."""
features = self.branch.features.isin(self._config.ignore)
+ X_test = self.branch.X_test.iloc[:, ~features]
if self.scaler:
- return self.scaler.transform(self.branch.X_test.iloc[:, ~features])
+ return cast(pd.DataFrame, self.scaler.transform(X_test))
else:
- return self.branch.X_test.iloc[:, ~features]
+ return X_test
@property
def X_holdout(self) -> pd.DataFrame | None:
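
The `cast` calls introduced above only narrow the static type, since `scaler.transform` is annotated more broadly than the `pd.DataFrame` these properties promise. In miniature:

```python
# typing.cast performs no runtime conversion or check; it is purely a
# signal to the type checker that `obj` is already a pd.DataFrame.
from typing import cast

import pandas as pd

def as_frame(obj: object) -> pd.DataFrame:
    return cast(pd.DataFrame, obj)  # no-op at runtime

df = as_frame(pd.DataFrame({"a": [1, 2]}))
```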
@@ -2195,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False):
if include_holdout and self.holdout is None:
raise ValueError("No holdout data set available.")
- if include_holdout and self.holdout is not None:
+ if not include_holdout:
+ X, y = self.X, self.y
+ else:
X = pd.concat([self.X, self.X_holdout])
y = pd.concat([self.y, self.y_holdout])
- else:
- X, y = self.X, self.y
# Assign a mlflow run to the new estimator
if self.experiment:
@@ -2518,17 +2521,6 @@ def get_tags(self) -> dict[str, Any]:
"supports_engines": ", ".join(getattr(self, "supports_engines", [])),
}
- @overload
- def _prediction(
- self,
- X: RowSelector | XSelector,
- y: YSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- sample_weight: Sequence[Scalar] | None = ...,
- verbose: Verbose | None = ...,
- method: Literal["score"] = ...,
- ) -> Float: ...
-
@overload
def _prediction(
self,
@@ -2545,6 +2537,17 @@ def _prediction(
] = ...,
) -> Pandas: ...
+ @overload
+ def _prediction(
+ self,
+ X: RowSelector | XSelector,
+ y: YSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ sample_weight: Sequence[Scalar] | None,
+ verbose: Verbose | None,
+ method: Literal["score"],
+ ) -> Float: ...
+
def _prediction(
self,
X: RowSelector | XSelector,
@@ -2567,13 +2570,12 @@ def _prediction(
set with shape=(n_samples, n_features) to make predictions
on.
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
Target column(s) corresponding to `X`.
- If None: `y` is ignored.
- If int: Position of the target column in `X`.
- If str: Name of the target column in `X`.
- - If dict: Name of the target column and sequence of values.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
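
The "score" overload moves below the general variant and loses its `= ...` defaults, so a bare call resolves to the defaulted predict-style signature while score calls must spell out every argument. A toy reduction of the pattern:

```python
# Toy reduction of the overload ordering: the defaulted, general variant
# comes first; the fully-positional "score" variant follows and cannot
# accidentally capture calls that omit arguments.
from typing import Literal, overload

@overload
def _predictish(method: Literal["predict"] = ...) -> list[float]: ...
@overload
def _predictish(method: Literal["score"]) -> float: ...
def _predictish(method: str = "predict") -> list[float] | float:
    return 0.0 if method == "score" else [0.0]

preds = _predictish()        # checker infers list[float]
score = _predictish("score") # checker infers float
```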
@@ -2603,23 +2605,26 @@ def _prediction(
"""
- def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]:
+ def get_transform_X_y(
+ X: RowSelector | XSelector,
+ y: YSelector | None,
+ ) -> tuple[pd.DataFrame, Pandas | None]:
"""Get X and y from the pipeline transformation.
Parameters
----------
- X: dataframe-like
- Feature set.
+ X: hashable, segment, sequence or dataframe-like
+ Feature set. If not dataframe-like, expected to fail.
- y: int, str or sequence
- Target column(s).
+ y: int, str, sequence, dataframe-like or None
+ Target column(s) corresponding to `X`.
Returns
-------
dataframe
Transformed feature set.
- series or dataframe
+ series, dataframe or None
Transformed target column.
"""
@@ -2889,13 +2894,12 @@ def score(
set with shape=(n_samples, n_features) to make predictions
on.
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
Target column(s) corresponding to `X`.
- If None: `X` must be a selection of rows in the dataset.
- If int: Position of the target column in `X`.
- If str: Name of the target column in `X`.
- - If dict: Name of the target column and sequence of values.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
@@ -2965,39 +2969,39 @@ def _prediction(
X: XSelector | None = ...,
metric: str | MetricFunction | Scorer | None = ...,
verbose: Verbose | None = ...,
- method: Literal["score"] = ...,
+ method: Literal[
+ "predict",
+ "predict_interval",
+ "predict_quantiles",
+ "predict_residuals",
+ "predict_var",
+ ] = ...,
**kwargs,
- ) -> Float: ...
+ ) -> Pandas: ...
@overload
def _prediction(
self,
- fh: RowSelector | FHConstructor | None = ...,
- y: RowSelector | YSelector | None = ...,
- X: XSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- verbose: Verbose | None = ...,
- method: Literal["predict_proba"] = ...,
+ fh: RowSelector | FHConstructor | None,
+ y: RowSelector | YSelector | None,
+ X: XSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ verbose: Verbose | None,
+ method: Literal["predict_proba"],
**kwargs,
) -> Normal: ...
@overload
def _prediction(
self,
- fh: RowSelector | FHConstructor | None = ...,
- y: RowSelector | YSelector | None = ...,
- X: XSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- verbose: Verbose | None = ...,
- method: Literal[
- "predict",
- "predict_interval",
- "predict_quantiles",
- "predict_residuals",
- "predict_var",
- ] = ...,
+ fh: RowSelector | FHConstructor | None,
+ y: RowSelector | YSelector | None,
+ X: XSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ verbose: Verbose | None,
+ method: Literal["score"],
**kwargs,
- ) -> Pandas: ...
+ ) -> Float: ...
def _prediction(
self,
@@ -3021,7 +3025,7 @@ def _prediction(
The [forecasting horizon][row-and-column-selection] encoding
the time stamps to forecast at.
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3299,7 +3303,7 @@ def predict_residuals(
Parameters
----------
- y: int, str, dict, sequence or dataframe
+ y: int, str, sequence or dataframe
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3397,7 +3401,7 @@ def score(
Parameters
----------
- y: int, str, dict, sequence or dataframe-like
+ y: int, str, sequence or dataframe-like
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
diff --git a/atom/basetransformer.py b/atom/basetransformer.py
index 5e2dfee29..859d3f930 100644
--- a/atom/basetransformer.py
+++ b/atom/basetransformer.py
@@ -181,12 +181,12 @@ def backend(self, value: Backend):
elif value == "dask":
check_dependency("dask")
- import dask
+ from dask.distributed import Client
try:
- dask.distributed.Client.current()
+ Client.current()
except ValueError:
- dask.distributed.Client(processes=False)
+ Client(processes=False)
joblib.parallel_config(backend=value)
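
This fix matters because `import dask` alone does not import the `dask.distributed` submodule. The reuse-or-create pattern standalone, assuming dask[distributed] is installed:

```python
# Sketch of the pattern above: reuse the session's Dask client when one
# exists, otherwise start a local, in-process (threaded) one.
from dask.distributed import Client

try:
    client = Client.current()  # raises ValueError if no client is active
except ValueError:
    client = Client(processes=False)  # threads, no separate worker processes
```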
@@ -369,7 +369,7 @@ def _device_id(self) -> int:
@overload
def _check_input(
X: XSelector,
- y: Literal[None] = ...,
+ y: Literal[None],
*,
columns: Axes | None = ...,
name: str | Axes | None = ...,
diff --git a/atom/data/branch.py b/atom/data/branch.py
index fd5710ab4..d2f1f20b4 100644
--- a/atom/data/branch.py
+++ b/atom/data/branch.py
@@ -428,9 +428,9 @@ def shape(self) -> tuple[Int, Int]:
return self.dataset.shape
@property
- def columns(self) -> list[str]:
+ def columns(self) -> pd.Index:
"""Name of all the columns."""
- return list(self.dataset.columns)
+ return self.dataset.columns
@property
def n_columns(self) -> int:
@@ -438,9 +438,9 @@ def n_columns(self) -> int:
return len(self.columns)
@property
- def features(self) -> list[str]:
+ def features(self) -> pd.Index:
"""Name of the features."""
- return list(self.columns[:-self._data.n_targets])
+ return self.columns[:-self._data.n_targets]
@property
def n_features(self) -> int:
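
Returning `pd.Index` rather than a plain list is what lets callers such as `X_train` above build boolean masks directly. In miniature:

```python
# Why pd.Index: .isin() produces a boolean mask that supports inversion
# with `~` and positional selection; a plain list does neither.
import pandas as pd

features = pd.Index(["age", "fare", "target"])
mask = features.isin(["target"])  # array([False, False,  True])
print(features[~mask])            # Index(['age', 'fare'], dtype='object')
```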
@@ -460,7 +460,7 @@ def _all(self) -> pd.DataFrame:
calculation.
"""
- return pd.concat([self.dataset, self.holdout]) # type: ignore[list-item]
+ return pd.concat([self.dataset, self.holdout])
# Utility methods ============================================== >>
@@ -580,10 +580,12 @@ def _get_rows(
# If rows were excluded with `!`, select all but those
inc = list(_all.index[~_all.index.isin(exc)])
+ rows_c = _all.loc[inc]
+
if return_X_y:
- return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index]
+ return rows_c[self.features], rows_c[self.target]
else:
- return self._all.loc[inc]
+ return rows_c
def _get_columns(
self,
diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
index e713844e0..2861c0326 100644
--- a/atom/data_cleaning.py
+++ b/atom/data_cleaning.py
@@ -820,10 +820,10 @@ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> S
self.target_names_in_ = np.array(get_col_names(yt))
if self.drop_chars:
- if isinstance(yt, pd.Series):
- yt.name = re.sub(self.drop_chars, "", str(yt.name))
- else:
+ if isinstance(yt, pd.DataFrame):
yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1)
+ else:
+ yt.name = re.sub(self.drop_chars, "", str(yt.name))
if self.drop_missing_target:
yt = replace_missing(yt, self.missing_).dropna(axis=0)
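
The reordered branches do the same work as before; only which type gets the positive `isinstance` check changes. The dispatch in miniature (`strip_chars` is an illustrative name):

```python
# Illustrative: DataFrame targets rename their columns, Series targets
# rename their single name. Mirrors the dispatch above with a toy pattern.
import re

import pandas as pd

def strip_chars(y, drop_chars: str = r"[^A-Za-z0-9]"):
    if isinstance(y, pd.DataFrame):
        return y.rename(lambda x: re.sub(drop_chars, "", str(x)), axis=1)
    y = y.copy()
    y.name = re.sub(drop_chars, "", str(y.name))
    return y

print(strip_chars(pd.Series([1, 2], name="tar get!")).name)  # "target"
```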
diff --git a/atom/utils/utils.py b/atom/utils/utils.py
index 79fe21dd3..10354963c 100644
--- a/atom/utils/utils.py
+++ b/atom/utils/utils.py
@@ -24,15 +24,19 @@
from types import GeneratorType, MappingProxyType
from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload
+import mlflow
+import nltk
import numpy as np
import pandas as pd
+import plotly.graph_objects as go
import scipy.sparse as sps
from beartype.door import is_bearable
from IPython.display import display
+from matplotlib.colors import to_rgba
from pandas._libs.missing import NAType
from pandas._typing import Axes, Dtype
from pandas.api.types import is_numeric_dtype
-from pandas.core.generic import NDFrame
+from shap import Explainer
from sklearn.base import BaseEstimator
from sklearn.base import OneToOneFeatureMixin as FMixin
from sklearn.metrics import (
@@ -55,7 +59,7 @@
if TYPE_CHECKING:
from optuna.study import Study
from optuna.trial import FrozenTrial
- from shap import Explainer, Explanation
+ from shap import Explanation
from atom.basemodel import BaseModel
from atom.baserunner import BaseRunner
@@ -63,7 +67,7 @@
T = TypeVar("T")
-T_Pandas = TypeVar("T_Pandas", bound=NDFrame)
+T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame)
T_Transformer = TypeVar("T_Transformer", bound=Transformer)
T_Estimator = TypeVar("T_Estimator", bound=Estimator)
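
Swapping the `NDFrame` bound for an explicitly constrained TypeVar keeps Series-in/Series-out and DataFrame-in/DataFrame-out signatures without importing pandas internals. A minimal sketch:

```python
# Constrained TypeVar in miniature: each call site resolves T_Pandas to
# exactly one of the listed constraints, so the return annotation
# preserves the input type.
from typing import TypeVar

import pandas as pd

T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame)

def passthrough(data: T_Pandas) -> T_Pandas:
    return data  # Series stays Series; DataFrame stays DataFrame
```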
@@ -633,8 +637,6 @@ def __call__(self, study: Study, trial: FrozenTrial):
# Save trials to mlflow experiment as nested runs
if self.T.experiment and self.T.log_ht:
- import mlflow
-
with mlflow.start_run(run_id=self.T.run.info.run_id):
run_name = f"{self.T.name} - {trial.number}"
with mlflow.start_run(run_name=run_name, nested=True):
@@ -734,8 +736,6 @@ class PlotCallback:
max_len = 15 # Maximum trials to show at once in the plot
def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics):
- import plotly.graph_objects as go
-
self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))}
self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))}
@@ -925,8 +925,6 @@ def explainer(self) -> Explainer:
Get the initialized explainer object.
"""
- from shap import Explainer
-
kwargs = {
"masker": self.branch.X_train,
"feature_names": list(self.branch.features),
@@ -1286,8 +1284,6 @@ def to_rgb(c: str) -> str:
Color's RGB representation.
"""
- from matplotlib.colors import to_rgba
-
if not c.startswith("rgb"):
colors = to_rgba(c)[:3]
return f"rgb({colors[0]}, {colors[1]}, {colors[2]})"
@@ -1375,15 +1371,15 @@ def get_nan(dtype: Dtype) -> float | NAType:
# Always convert these values
default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf]
- if isinstance(X, pd.Series):
+ if isinstance(X, pd.DataFrame):
return X.replace(
- to_replace=(missing_values or []) + default_values,
- value=get_nan(X.dtype),
+ to_replace={c: (missing_values or []) + default_values for c in X.columns},
+ value={c: get_nan(d) for c, d in X.dtypes.items()},
)
else:
return X.replace(
- to_replace={c: (missing_values or []) + default_values for c in X.columns},
- value={c: get_nan(d) for c, d in X.dtypes.items()},
+ to_replace=(missing_values or []) + default_values,
+ value=get_nan(X.dtype),
)
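
The swapped branches are behaviorally identical; the DataFrame path maps every column to its own replacement list and a dtype-appropriate missing marker. Standalone, with 9999 standing in for a configured missing value:

```python
# Sketch of the DataFrame branch: each column gets its own to_replace
# list and its own dtype-appropriate NaN (np.nan for floats, pd.NA for
# nullable integers).
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1.0, 9999.0],
    "b": pd.array([1, 9999], dtype="Int64"),
})
missing = [9999]
print(df.replace(
    to_replace={c: missing for c in df.columns},
    value={"a": np.nan, "b": pd.NA},
))
```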
@@ -1584,8 +1580,6 @@ def check_nltk_module(module: str, *, quiet: bool):
Whether to show logs when downloading.
"""
- import nltk
-
try:
nltk.data.find(module)
except LookupError:
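
With `nltk` now imported at module level, the find-then-download pattern looks like this on its own (a sketch, not the exact helper):

```python
# nltk.data.find raises LookupError when a resource is absent; the
# download call takes the bare resource name (e.g. "stopwords" for
# "corpora/stopwords").
import nltk

def ensure_nltk(module: str, *, quiet: bool = True) -> None:
    try:
        nltk.data.find(module)
    except LookupError:
        nltk.download(module.split("/")[-1], quiet=quiet)

ensure_nltk("corpora/stopwords")
```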
diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md
index f464f3795..7dcaa5c61 100644
--- a/docs_sources/dependencies.md
+++ b/docs_sources/dependencies.md
@@ -34,7 +34,7 @@ packages are necessary for its correct functioning.
* **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1)
* **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1)
* **[matplotlib](https://matplotlib.org/)** (>=3.7.2)
-* **[mlflow](https://mlflow.org/)** (>=2.7.1)
+* **[mlflow](https://mlflow.org/)** (>=2.10.2)
* **[nltk](https://www.nltk.org/)** (>=3.8.1)
* **[numpy](https://numpy.org/)** (>=1.23.0)
* **[optuna](https://optuna.org/)** (>=3.4.0)
@@ -57,7 +57,7 @@ additional libraries. You can install all the optional dependencies using
* **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5)
* **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2)
* **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8)
-* **[dask](https://dask.org/)** (>=2024.2.0)
+* **[dask[distributed]](https://dask.org/)** (>=2024.2.0)
* **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3)
* **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4)
* **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0)
diff --git a/pyproject.toml b/pyproject.toml
index 866f05ed6..981280631 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
"ipywidgets>=8.1.1",
"joblib>=1.3.1",
"matplotlib>=3.7.2",
- "mlflow>=2.7.1",
+ "mlflow>=2.10.2",
"nltk>=3.8.1",
"numpy>=1.23.0",
"optuna>=3.4.0",
@@ -48,7 +48,7 @@ full = [
"botorch>=0.8.5",
"catboost>=1.2",
"dagshub>=0.3.8",
- "dask>=2024.2.0",
+ "dask[distributed]>=2024.2.0",
"explainerdashboard>=0.4.3",
"gradio>=3.44.4",
"lightgbm>=4.1.0",
diff --git a/tests/conftest.py b/tests/conftest.py
index 4e58c2c77..97c7858ba 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,9 +33,7 @@
from _pytest.monkeypatch import MonkeyPatch
- from atom.utils.types import (
- DataFrame, Pandas, Sequence, XConstructor,
- )
+ from atom.utils.types import DataFrame, Pandas, Sequence, XConstructor
class DummyTransformer(TransformerMixin, BaseEstimator):