From a59a3b51c8c22db50e8bc78fafb4d075b7fa0ed0 Mon Sep 17 00:00:00 2001
From: Marco van den Boom
Date: Mon, 26 Feb 2024 19:19:39 +0100
Subject: [PATCH] dataengines final

---
 atom/_show_versions.py       |  27 ++++++---
 atom/atom.py                 |   3 +-
 atom/basemodel.py            | 114 ++++++++++++++++++-----------------
 atom/basetransformer.py      |   8 +--
 atom/data/branch.py          |  16 ++---
 atom/data_cleaning.py        |   6 +-
 atom/utils/utils.py          |  30 ++++-----
 docs_sources/dependencies.md |   4 +-
 pyproject.toml               |   4 +-
 tests/conftest.py            |   4 +-
 10 files changed, 113 insertions(+), 103 deletions(-)

diff --git a/atom/_show_versions.py b/atom/_show_versions.py
index 22e02a07f..56dfcbdc9 100644
--- a/atom/_show_versions.py
+++ b/atom/_show_versions.py
@@ -20,12 +20,11 @@
     "atom",
     "beartype",
     "category_encoders",
-    "dagshub",
     "dill",
+    "featuretools",
     "gplearn",
     "imblearn",
     "ipywidgets",
-    "featuretools",
     "joblib",
     "matplotlib",
     "mlflow",
@@ -35,17 +34,31 @@
     "optuna",
     "pandas",
     "plotly",
-    "polars",
-    "pyarrow",
-    "ray",
-    "requests",
     "sklearn",
-    "sklearnex",  # Has no __version__ attribute
     "scipy",
     "shap",
     "sktime",
     "statsmodels",
     "zoofs",  # Has no __version__ attribute
+    "botorch",
+    "catboost",
+    "dagshub",
+    "dask[distributed]",
+    "explainerdashboard",
+    "gradio",
+    "lightgbm",
+    "modin[ray]",
+    "polars",
+    "pyarrow",
+    "pyspark",
+    "ray[serve]",
+    "requests",
+    "sklearnex",
+    "schemdraw",
+    "statsforecast",
+    "sweetviz",
+    "wordcloud",
+    "xgboost",
 ]
diff --git a/atom/atom.py b/atom/atom.py
index 974838f17..8b6c600de 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -748,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
         **X, train, test: dataframe-like**<br>
            Feature set with shape=(n_samples, n_features).
 
-        **y: int, str, dict, sequence or dataframe**<br>
+        **y: int, str, sequence or dataframe**<br>
            Target column(s) corresponding to `X`.
 
            - If int: Position of the target column in `X`.
            - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
            - If sequence: Target column with shape=(n_samples,) or
              sequence of column names or positions for multioutput
              tasks.
diff --git a/atom/basemodel.py b/atom/basemodel.py
index 6fb2ed6d4..170940b93 100644
--- a/atom/basemodel.py
+++ b/atom/basemodel.py
@@ -15,7 +15,7 @@
 from importlib import import_module
 from logging import Logger
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
 from unittest.mock import patch
 
 import dill as pickle
@@ -274,7 +274,8 @@ def __init__(
         self._train_idx = len(self.branch._data.train_idx)  # Can change for sh and ts
 
         if getattr(self, "needs_scaling", None) and not self.branch.check_scaling():
-            self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train)
+            self.scaler = Scaler(device=self.device, engine=self.engine.estimator)
+            self.scaler.fit(self.X_train)
 
     def __repr__(self) -> str:
         """Display class name."""
@@ -704,7 +705,7 @@ def _get_pred(
             # Statsmodels models such as SARIMAX and DF require all
             # exogenous data after the last row of the train set
             # Other models accept this format
-            Xe = pd.concat([self.test, self.holdout])  # type: ignore[list-item]
+            Xe = pd.concat([self.test, self.holdout])
             exog = Xe.loc[Xe.index <= X.index.max(), self.features]  # type: ignore[index]
 
             y_pred = self._prediction(
@@ -1680,10 +1681,11 @@ def y(self) -> Pandas:
     def X_train(self) -> pd.DataFrame:
         """Features of the training set."""
         features = self.branch.features.isin(self._config.ignore)
+        X_train = self.branch.X_train.iloc[-self._train_idx:, ~features]
         if self.scaler:
-            return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features])
+            return cast(pd.DataFrame, self.scaler.transform(X_train))
         else:
-            return self.branch.X_train.iloc[-self._train_idx:, ~features]
+            return X_train
 
     @property
     def y_train(self) -> Pandas:
@@ -1694,10 +1696,11 @@ def X_test(self) -> pd.DataFrame:
     def X_test(self) -> pd.DataFrame:
         """Features of the test set."""
         features = self.branch.features.isin(self._config.ignore)
+        X_test = self.branch.X_test.iloc[:, ~features]
         if self.scaler:
-            return self.scaler.transform(self.branch.X_test.iloc[:, ~features])
+            return cast(pd.DataFrame, self.scaler.transform(X_test))
         else:
-            return self.branch.X_test.iloc[:, ~features]
+            return X_test
 
     @property
     def X_holdout(self) -> pd.DataFrame | None:
@@ -2195,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False):
         if include_holdout and self.holdout is None:
             raise ValueError("No holdout data set available.")
 
-        if include_holdout and self.holdout is not None:
+        if not include_holdout:
+            X, y = self.X, self.y
+        else:
             X = pd.concat([self.X, self.X_holdout])
             y = pd.concat([self.y, self.y_holdout])
-        else:
-            X, y = self.X, self.y
 
         # Assign a mlflow run to the new estimator
         if self.experiment:
@@ -2518,17 +2521,6 @@ def get_tags(self) -> dict[str, Any]:
             "supports_engines": ", ".join(getattr(self, "supports_engines", [])),
         }
 
-    @overload
-    def _prediction(
-        self,
-        X: RowSelector | XSelector,
-        y: YSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        sample_weight: Sequence[Scalar] | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal["score"] = ...,
-    ) -> Float: ...
-
     @overload
     def _prediction(
         self,
         X: RowSelector | XSelector,
         y: YSelector | None = ...,
         metric: str | MetricFunction | Scorer | None = ...,
         sample_weight: Sequence[Scalar] | None = ...,
         verbose: Verbose | None = ...,
         method: Literal[
             "decision_function",
             "predict",
             "predict_log_proba",
             "predict_proba",
         ] = ...,
     ) -> Pandas: ...
 
+    @overload
+    def _prediction(
+        self,
+        X: RowSelector | XSelector,
+        y: YSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        sample_weight: Sequence[Scalar] | None,
+        verbose: Verbose | None,
+        method: Literal["score"],
+    ) -> Float: ...
+
     def _prediction(
         self,
         X: RowSelector | XSelector,
@@ -2567,13 +2570,12 @@ def _prediction(
             set with shape=(n_samples, n_features) to make
             predictions on.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Target column(s) corresponding to `X`.
 
             - If None: `y` is ignored.
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.
@@ -2603,23 +2605,26 @@ def _prediction(
 
         """
 
-        def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]:
+        def get_transform_X_y(
+            X: RowSelector | XSelector,
+            y: YSelector | None,
+        ) -> tuple[pd.DataFrame, Pandas | None]:
             """Get X and y from the pipeline transformation.
 
             Parameters
             ----------
-            X: dataframe-like
-                Feature set.
+            X: hashable, segment, sequence or dataframe-like
+                Feature set. If not dataframe-like, expected to fail.
 
-            y: int, str or sequence
-                Target column(s).
+            y: int, str, sequence, dataframe-like or None
+                Target column(s) corresponding to `X`.
 
             Returns
             -------
             dataframe
                 Transformed feature set.
 
-            series or dataframe
+            series, dataframe or None
                 Transformed target column.
 
             """
@@ -2889,13 +2894,12 @@ def score(
             set with shape=(n_samples, n_features) to make
             predictions on.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Target column(s) corresponding to `X`.
 
             - If None: `X` must be a selection of rows in the dataset.
             - If int: Position of the target column in `X`.
             - If str: Name of the target column in `X`.
-            - If dict: Name of the target column and sequence of values.
             - If sequence: Target column with shape=(n_samples,) or
               sequence of column names or positions for multioutput
               tasks.
@@ -2965,39 +2969,39 @@ def _prediction(
         X: XSelector | None = ...,
         metric: str | MetricFunction | Scorer | None = ...,
         verbose: Verbose | None = ...,
-        method: Literal["score"] = ...,
+        method: Literal[
+            "predict",
+            "predict_interval",
+            "predict_quantiles",
+            "predict_residuals",
+            "predict_var",
+        ] = ...,
         **kwargs,
-    ) -> Float: ...
+    ) -> Pandas: ...
 
     @overload
     def _prediction(
         self,
-        fh: RowSelector | FHConstructor | None = ...,
-        y: RowSelector | YSelector | None = ...,
-        X: XSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal["predict_proba"] = ...,
+        fh: RowSelector | FHConstructor | None,
+        y: RowSelector | YSelector | None,
+        X: XSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        verbose: Verbose | None,
+        method: Literal["predict_proba"],
         **kwargs,
     ) -> Normal: ...
 
     @overload
     def _prediction(
         self,
-        fh: RowSelector | FHConstructor | None = ...,
-        y: RowSelector | YSelector | None = ...,
-        X: XSelector | None = ...,
-        metric: str | MetricFunction | Scorer | None = ...,
-        verbose: Verbose | None = ...,
-        method: Literal[
-            "predict",
-            "predict_interval",
-            "predict_quantiles",
-            "predict_residuals",
-            "predict_var",
-        ] = ...,
+        fh: RowSelector | FHConstructor | None,
+        y: RowSelector | YSelector | None,
+        X: XSelector | None,
+        metric: str | MetricFunction | Scorer | None,
+        verbose: Verbose | None,
+        method: Literal["score"],
         **kwargs,
-    ) -> Pandas: ...
+    ) -> Float: ...
 
     def _prediction(
         self,
@@ -3021,7 +3025,7 @@ def _prediction(
             The [forecasting horizon][row-and-column-selection] encoding
             the time stamps to forecast at.
 
-        y: int, str, dict, sequence, dataframe-like or None, default=None
+        y: int, str, sequence, dataframe-like or None, default=None
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3299,7 +3303,7 @@ def predict_residuals(
 
         Parameters
         ----------
-        y: int, str, dict, sequence or dataframe
+        y: int, str, sequence or dataframe
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
@@ -3397,7 +3401,7 @@ def score(
 
         Parameters
        ----------
-        y: int, str, dict, sequence or dataframe-like
+        y: int, str, sequence or dataframe-like
             Ground truth observations.
 
         X: hashable, segment, sequence, dataframe-like or None, default=None
diff --git a/atom/basetransformer.py b/atom/basetransformer.py
index 5e2dfee29..859d3f930 100644
--- a/atom/basetransformer.py
+++ b/atom/basetransformer.py
@@ -181,12 +181,12 @@ def backend(self, value: Backend):
         elif value == "dask":
             check_dependency("dask")
-            import dask
+            from dask.distributed import Client
 
             try:
-                dask.distributed.Client.current()
+                Client.current()
             except ValueError:
-                dask.distributed.Client(processes=False)
+                Client(processes=False)
 
         joblib.parallel_config(backend=value)
 
@@ -369,7 +369,7 @@ def _device_id(self) -> int:
     @overload
     def _check_input(
         X: XSelector,
-        y: Literal[None] = ...,
+        y: Literal[None],
         *,
         columns: Axes | None = ...,
         name: str | Axes | None = ...,
diff --git a/atom/data/branch.py b/atom/data/branch.py
index fd5710ab4..d2f1f20b4 100644
--- a/atom/data/branch.py
+++ b/atom/data/branch.py
@@ -428,9 +428,9 @@ def shape(self) -> tuple[Int, Int]:
         return self.dataset.shape
 
     @property
-    def columns(self) -> pd.Index:
+    def columns(self) -> list[str]:
         """Name of all the columns."""
-        return self.dataset.columns
+        return list(self.dataset.columns)
 
     @property
     def n_columns(self) -> int:
@@ -438,9 +438,9 @@ def n_columns(self) -> int:
         return len(self.columns)
 
     @property
-    def features(self) -> pd.Index:
+    def features(self) -> list[str]:
         """Name of the features."""
-        return self.columns[:-self._data.n_targets]
+        return list(self.columns[:-self._data.n_targets])
 
     @property
     def n_features(self) -> int:
@@ -460,7 +460,7 @@ def _all(self) -> pd.DataFrame:
         calculation.
""" - return pd.concat([self.dataset, self.holdout]) # type: ignore[list-item] + return pd.concat([self.dataset, self.holdout]) # Utility methods ============================================== >> @@ -580,10 +580,12 @@ def _get_rows( # If rows were excluded with `!`, select all but those inc = list(_all.index[~_all.index.isin(exc)]) + rows_c = _all.loc[inc] + if return_X_y: - return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index] + return rows_c[self.features], rows_c[self.target] else: - return self._all.loc[inc] + return rows_c def _get_columns( self, diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index e713844e0..2861c0326 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -820,10 +820,10 @@ def fit(self, X: XConstructor | None = None, y: YConstructor | None = None) -> S self.target_names_in_ = np.array(get_col_names(yt)) if self.drop_chars: - if isinstance(yt, pd.Series): - yt.name = re.sub(self.drop_chars, "", str(yt.name)) - else: + if isinstance(yt, pd.DataFrame): yt = yt.rename(lambda x: re.sub(self.drop_chars, "", str(x)), axis=1) + else: + yt.name = re.sub(self.drop_chars, "", str(yt.name)) if self.drop_missing_target: yt = replace_missing(yt, self.missing_).dropna(axis=0) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index 79fe21dd3..10354963c 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -24,15 +24,19 @@ from types import GeneratorType, MappingProxyType from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload +import mlflow +import nltk import numpy as np import pandas as pd +import plotly.graph_objects as go import scipy.sparse as sps from beartype.door import is_bearable from IPython.display import display +from matplotlib.colors import to_rgba from pandas._libs.missing import NAType from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype -from pandas.core.generic import NDFrame +from shap import Explainer from sklearn.base import BaseEstimator from sklearn.base import OneToOneFeatureMixin as FMixin from sklearn.metrics import ( @@ -55,7 +59,7 @@ if TYPE_CHECKING: from optuna.study import Study from optuna.trial import FrozenTrial - from shap import Explainer, Explanation + from shap import Explanation from atom.basemodel import BaseModel from atom.baserunner import BaseRunner @@ -63,7 +67,7 @@ T = TypeVar("T") -T_Pandas = TypeVar("T_Pandas", bound=NDFrame) +T_Pandas = TypeVar("T_Pandas", pd.Series, pd.DataFrame, pd.Series | pd.DataFrame) T_Transformer = TypeVar("T_Transformer", bound=Transformer) T_Estimator = TypeVar("T_Estimator", bound=Estimator) @@ -633,8 +637,6 @@ def __call__(self, study: Study, trial: FrozenTrial): # Save trials to mlflow experiment as nested runs if self.T.experiment and self.T.log_ht: - import mlflow - with mlflow.start_run(run_id=self.T.run.info.run_id): run_name = f"{self.T.name} - {trial.number}" with mlflow.start_run(run_name=run_name, nested=True): @@ -734,8 +736,6 @@ class PlotCallback: max_len = 15 # Maximum trials to show at once in the plot def __init__(self, name: str, metric: list[str], aesthetics: Aesthetics): - import plotly.graph_objects as go - self.y1: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} self.y2: dict[int, deque] = {i: deque(maxlen=self.max_len) for i in range(len(metric))} @@ -925,8 +925,6 @@ def explainer(self) -> Explainer: Get the initialized explainer object. 
""" - from shap import Explainer - kwargs = { "masker": self.branch.X_train, "feature_names": list(self.branch.features), @@ -1286,8 +1284,6 @@ def to_rgb(c: str) -> str: Color's RGB representation. """ - from matplotlib.colors import to_rgba - if not c.startswith("rgb"): colors = to_rgba(c)[:3] return f"rgb({colors[0]}, {colors[1]}, {colors[2]})" @@ -1375,15 +1371,15 @@ def get_nan(dtype: Dtype) -> float | NAType: # Always convert these values default_values = [None, pd.NA, pd.NaT, np.NaN, np.inf, -np.inf] - if isinstance(X, pd.Series): + if isinstance(X, pd.DataFrame): return X.replace( - to_replace=(missing_values or []) + default_values, - value=get_nan(X.dtype), + to_replace={c: (missing_values or []) + default_values for c in X.columns}, + value={c: get_nan(d) for c, d in X.dtypes.items()}, ) else: return X.replace( - to_replace={c: (missing_values or []) + default_values for c in X.columns}, - value={c: get_nan(d) for c, d in X.dtypes.items()}, + to_replace=(missing_values or []) + default_values, + value=get_nan(X.dtype), ) @@ -1584,8 +1580,6 @@ def check_nltk_module(module: str, *, quiet: bool): Whether to show logs when downloading. """ - import nltk - try: nltk.data.find(module) except LookupError: diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index f464f3795..7dcaa5c61 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -34,7 +34,7 @@ packages are necessary for its correct functioning. * **[ipywidgets](https://pypi.org/project/ipywidgets/)** (>=8.1.1) * **[joblib](https://joblib.readthedocs.io/en/latest/)** (>=1.3.1) * **[matplotlib](https://matplotlib.org/)** (>=3.7.2) -* **[mlflow](https://mlflow.org/)** (>=2.7.1) +* **[mlflow](https://mlflow.org/)** (>=2.10.2) * **[nltk](https://www.nltk.org/)** (>=3.8.1) * **[numpy](https://numpy.org/)** (>=1.23.0) * **[optuna](https://optuna.org/)** (>=3.4.0) @@ -57,7 +57,7 @@ additional libraries. You can install all the optional dependencies using * **[botorch](https://botorch.org/docs/introduction)** (>=0.8.5) * **[catboost](https://catboost.ai/docs/concepts/about.html)** (>=1.2) * **[dagshub](https://github.com/DagsHub/client)** (>=0.3.8) -* **[dask](https://dask.org/)** (>=2024.2.0) +* **[dask[distributed]](https://dask.org/)** (>=2024.2.0) * **[explainerdashboard](https://explainerdashboard.readthedocs.io/en/latest/)** (>=0.4.3) * **[gradio](https://github.com/gradio-app/gradio)** (>=3.44.4) * **[lightgbm](https://lightgbm.readthedocs.io/en/latest/)** (>=4.1.0) diff --git a/pyproject.toml b/pyproject.toml index 866f05ed6..981280631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "ipywidgets>=8.1.1", "joblib>=1.3.1", "matplotlib>=3.7.2", - "mlflow>=2.7.1", + "mlflow>=2.10.2", "nltk>=3.8.1", "numpy>=1.23.0", "optuna>=3.4.0", @@ -48,7 +48,7 @@ full = [ "botorch>=0.8.5", "catboost>=1.2", "dagshub>=0.3.8", - "dask>=2024.2.0", + "dask[distributed]>=2024.2.0", "explainerdashboard>=0.4.3", "gradio>=3.44.4", "lightgbm>=4.1.0", diff --git a/tests/conftest.py b/tests/conftest.py index 4e58c2c77..97c7858ba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,9 +33,7 @@ from _pytest.monkeypatch import MonkeyPatch - from atom.utils.types import ( - DataFrame, Pandas, Sequence, XConstructor, - ) + from atom.utils.types import DataFrame, Pandas, Sequence, XConstructor class DummyTransformer(TransformerMixin, BaseEstimator):