Skip to content

Commit

Permalink
dataengines final
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcovdBoom committed Feb 26, 2024
1 parent c3d9c6b commit a59a3b5
Show file tree
Hide file tree
Showing 10 changed files with 113 additions and 103 deletions.
27 changes: 20 additions & 7 deletions atom/_show_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@
  "atom",
  "beartype",
  "category_encoders",
- "dagshub",
  "dill",
+ "featuretools",
  "gplearn",
  "imblearn",
  "ipywidgets",
- "featuretools",
"joblib",
"matplotlib",
"mlflow",
Expand All @@ -35,17 +34,31 @@
  "optuna",
  "pandas",
  "plotly",
- "polars",
- "pyarrow",
- "ray",
- "requests",
  "sklearn",
- "sklearnex", # Has no __version__ attribute
  "scipy",
  "shap",
  "sktime",
  "statsmodels",
  "zoofs", # Has no __version__ attribute
+ "botorch",
+ "catboost",
+ "dagshub",
+ "dask[distributed]",
+ "explainerdashboard",
+ "gradio",
+ "lightgbm",
+ "modin[ray]",
+ "polars",
+ "pyarrow",
+ "pyspark",
+ "ray[serve]",
+ "requests",
+ "sklearnex",
+ "schemdraw",
+ "statsforecast",
+ "sweetviz",
+ "wordcloud",
+ "xgboost",
]


Expand Down
3 changes: 1 addition & 2 deletions atom/atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,12 +748,11 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] | None = None) -> ATOM
**X, train, test: dataframe-like**<br>
Feature set with shape=(n_samples, n_features).
- **y: int, str, dict, sequence or dataframe**<br>
+ **y: int, str, sequence or dataframe**<br>
  Target column(s) corresponding to `X`.
  - If int: Position of the target column in `X`.
  - If str: Name of the target column in `X`.
- - If dict: Name of the target column and sequence of values.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
Expand Down
114 changes: 59 additions & 55 deletions atom/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from importlib import import_module
from logging import Logger
from pathlib import Path
- from typing import TYPE_CHECKING, Any, Literal, overload
+ from typing import TYPE_CHECKING, Any, Literal, cast, overload
from unittest.mock import patch

import dill as pickle
Expand Down Expand Up @@ -274,7 +274,8 @@ def __init__(
self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts

if getattr(self, "needs_scaling", None) and not self.branch.check_scaling():
- self.scaler = Scaler(device=self.device, engine=self.engine).fit(self.X_train)
+ self.scaler = Scaler(device=self.device, engine=self.engine.estimator)
+ self.scaler.fit(self.X_train)

def __repr__(self) -> str:
"""Display class name."""
Expand Down Expand Up @@ -704,7 +705,7 @@ def _get_pred(
# Statsmodels models such as SARIMAX and DF require all
# exogenous data after the last row of the train set
# Other models accept this format
- Xe = pd.concat([self.test, self.holdout]) # type: ignore[list-item]
+ Xe = pd.concat([self.test, self.holdout])
exog = Xe.loc[Xe.index <= X.index.max(), self.features] # type: ignore[index]

y_pred = self._prediction(
Expand Down Expand Up @@ -1680,10 +1681,11 @@ def y(self) -> Pandas:
def X_train(self) -> pd.DataFrame:
"""Features of the training set."""
features = self.branch.features.isin(self._config.ignore)
+ X_train = self.branch.X_train.iloc[-self._train_idx:, ~features]
  if self.scaler:
- return self.scaler.transform(self.branch.X_train.iloc[-self._train_idx:, ~features])
+ return cast(pd.DataFrame, self.scaler.transform(X_train))
  else:
- return self.branch.X_train.iloc[-self._train_idx:, ~features]
+ return X_train

@property
def y_train(self) -> Pandas:
Expand All @@ -1694,10 +1696,11 @@ def y_train(self) -> Pandas:
def X_test(self) -> pd.DataFrame:
"""Features of the test set."""
features = self.branch.features.isin(self._config.ignore)
+ X_test = self.branch.X_test.iloc[:, ~features]
  if self.scaler:
- return self.scaler.transform(self.branch.X_test.iloc[:, ~features])
+ return cast(pd.DataFrame, self.scaler.transform(X_test))
  else:
- return self.branch.X_test.iloc[:, ~features]
+ return X_test

@property
def X_holdout(self) -> pd.DataFrame | None:
Expand Down Expand Up @@ -2195,11 +2198,11 @@ def full_train(self, *, include_holdout: Bool = False):
if include_holdout and self.holdout is None:
raise ValueError("No holdout data set available.")

- if include_holdout and self.holdout is not None:
+ if not include_holdout:
+ X, y = self.X, self.y
+ else:
  X = pd.concat([self.X, self.X_holdout])
  y = pd.concat([self.y, self.y_holdout])
- else:
- X, y = self.X, self.y

# Assign a mlflow run to the new estimator
if self.experiment:
Expand Down Expand Up @@ -2518,17 +2521,6 @@ def get_tags(self) -> dict[str, Any]:
"supports_engines": ", ".join(getattr(self, "supports_engines", [])),
}

- @overload
- def _prediction(
- self,
- X: RowSelector | XSelector,
- y: YSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- sample_weight: Sequence[Scalar] | None = ...,
- verbose: Verbose | None = ...,
- method: Literal["score"] = ...,
- ) -> Float: ...

@overload
def _prediction(
self,
Expand All @@ -2545,6 +2537,17 @@ def _prediction(
] = ...,
) -> Pandas: ...

+ @overload
+ def _prediction(
+ self,
+ X: RowSelector | XSelector,
+ y: YSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ sample_weight: Sequence[Scalar] | None,
+ verbose: Verbose | None,
+ method: Literal["score"],
+ ) -> Float: ...

def _prediction(
self,
X: RowSelector | XSelector,
Expand All @@ -2567,13 +2570,12 @@ def _prediction(
set with shape=(n_samples, n_features) to make predictions
on.
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
  Target column(s) corresponding to `X`.
  - If None: `y` is ignored.
  - If int: Position of the target column in `X`.
  - If str: Name of the target column in `X`.
- - If dict: Name of the target column and sequence of values.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
Expand Down Expand Up @@ -2603,23 +2605,26 @@ def _prediction(
"""

- def get_transform_X_y(X: XSelector, y: YSelector) -> tuple[pd.DataFrame, Pandas]:
+ def get_transform_X_y(
+ X: RowSelector | XSelector,
+ y: YSelector | None,
+ ) -> tuple[pd.DataFrame, Pandas | None]:
  """Get X and y from the pipeline transformation.
  Parameters
  ----------
- X: dataframe-like
- Feature set.
+ X: hashable, segment, sequence or dataframe-like
+ Feature set. If not dataframe-like, expected to fail.
- y: int, str or sequence
- Target column(s).
+ y: int, str, sequence, dataframe-like or None
+ Target column(s) corresponding to `X`.
  Returns
  -------
  dataframe
  Transformed feature set.
- series or dataframe
+ series, dataframe or None
Transformed target column.
"""
Expand Down Expand Up @@ -2889,13 +2894,12 @@ def score(
set with shape=(n_samples, n_features) to make predictions
on.
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
  Target column(s) corresponding to `X`.
  - If None: `X` must be a selection of rows in the dataset.
  - If int: Position of the target column in `X`.
  - If str: Name of the target column in `X`.
- - If dict: Name of the target column and sequence of values.
- If sequence: Target column with shape=(n_samples,) or
sequence of column names or positions for multioutput
tasks.
Expand Down Expand Up @@ -2965,39 +2969,39 @@ def _prediction(
X: XSelector | None = ...,
metric: str | MetricFunction | Scorer | None = ...,
verbose: Verbose | None = ...,
- method: Literal["score"] = ...,
+ method: Literal[
+ "predict",
+ "predict_interval",
+ "predict_quantiles",
+ "predict_residuals",
+ "predict_var",
+ ] = ...,
  **kwargs,
- ) -> Float: ...
+ ) -> Pandas: ...

@overload
def _prediction(
self,
- fh: RowSelector | FHConstructor | None = ...,
- y: RowSelector | YSelector | None = ...,
- X: XSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- verbose: Verbose | None = ...,
- method: Literal["predict_proba"] = ...,
+ fh: RowSelector | FHConstructor | None,
+ y: RowSelector | YSelector | None,
+ X: XSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ verbose: Verbose | None,
+ method: Literal["predict_proba"],
**kwargs,
) -> Normal: ...

@overload
def _prediction(
self,
- fh: RowSelector | FHConstructor | None = ...,
- y: RowSelector | YSelector | None = ...,
- X: XSelector | None = ...,
- metric: str | MetricFunction | Scorer | None = ...,
- verbose: Verbose | None = ...,
- method: Literal[
- "predict",
- "predict_interval",
- "predict_quantiles",
- "predict_residuals",
- "predict_var",
- ] = ...,
+ fh: RowSelector | FHConstructor | None,
+ y: RowSelector | YSelector | None,
+ X: XSelector | None,
+ metric: str | MetricFunction | Scorer | None,
+ verbose: Verbose | None,
+ method: Literal["score"],
  **kwargs,
- ) -> Pandas: ...
+ ) -> Float: ...

def _prediction(
self,
Expand All @@ -3021,7 +3025,7 @@ def _prediction(
The [forecasting horizon][row-and-column-selection] encoding
the time stamps to forecast at.
- y: int, str, dict, sequence, dataframe-like or None, default=None
+ y: int, str, sequence, dataframe-like or None, default=None
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
Expand Down Expand Up @@ -3299,7 +3303,7 @@ def predict_residuals(
Parameters
----------
- y: int, str, dict, sequence or dataframe
+ y: int, str, sequence or dataframe
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
Expand Down Expand Up @@ -3397,7 +3401,7 @@ def score(
Parameters
----------
- y: int, str, dict, sequence or dataframe-like
+ y: int, str, sequence or dataframe-like
Ground truth observations.
X: hashable, segment, sequence, dataframe-like or None, default=None
Expand Down
8 changes: 4 additions & 4 deletions atom/basetransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,12 @@ def backend(self, value: Backend):

elif value == "dask":
check_dependency("dask")
- import dask
+ from dask.distributed import Client

  try:
- dask.distributed.Client.current()
+ Client.current()
  except ValueError:
- dask.distributed.Client(processes=False)
+ Client(processes=False)

joblib.parallel_config(backend=value)

Expand Down Expand Up @@ -369,7 +369,7 @@ def _device_id(self) -> int:
@overload
def _check_input(
X: XSelector,
- y: Literal[None] = ...,
+ y: Literal[None],
*,
columns: Axes | None = ...,
name: str | Axes | None = ...,
Expand Down
16 changes: 9 additions & 7 deletions atom/data/branch.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,19 +428,19 @@ def shape(self) -> tuple[Int, Int]:
return self.dataset.shape

@property
- def columns(self) -> pd.Index:
+ def columns(self) -> list[str]:
  """Name of all the columns."""
- return self.dataset.columns
+ return list(self.dataset.columns)

@property
def n_columns(self) -> int:
"""Number of columns."""
return len(self.columns)

@property
- def features(self) -> pd.Index:
+ def features(self) -> list[str]:
  """Name of the features."""
- return self.columns[:-self._data.n_targets]
+ return list(self.columns[:-self._data.n_targets])

@property
def n_features(self) -> int:
Expand All @@ -460,7 +460,7 @@ def _all(self) -> pd.DataFrame:
calculation.
"""
- return pd.concat([self.dataset, self.holdout]) # type: ignore[list-item]
+ return pd.concat([self.dataset, self.holdout])

# Utility methods ============================================== >>

Expand Down Expand Up @@ -580,10 +580,12 @@ def _get_rows(
# If rows were excluded with `!`, select all but those
inc = list(_all.index[~_all.index.isin(exc)])

+ rows_c = _all.loc[inc]
+
  if return_X_y:
- return _all.loc[inc, self.features], _all.loc[inc, self.target] # type: ignore[index]
+ return rows_c[self.features], rows_c[self.target]
  else:
- return self._all.loc[inc]
+ return rows_c

def _get_columns(
self,
Expand Down
Loading

0 comments on commit a59a3b5

Please sign in to comment.