From 83f318af3a2ffd4c0dc6a7c95e98a6efba8e20cc Mon Sep 17 00:00:00 2001 From: Mavs Date: Sat, 2 Mar 2024 17:32:42 +0100 Subject: [PATCH] fix mypy all --- .pre-commit-config.yaml | 4 +- atom/basemodel.py | 165 +++++------ atom/models/custom.py | 2 +- atom/models/ensembles.py | 12 +- atom/plots/predictionplot.py | 59 ++-- atom/utils/patches.py | 327 +-------------------- atom/utils/types.py | 2 +- atom/utils/utils.py | 82 ++---- docs_sources/dependencies.md | 2 +- docs_sources/user_guide/data_management.md | 3 +- pyproject.toml | 8 +- tests/test_atom.py | 2 +- tests/test_basemodel.py | 19 +- tests/test_basetransformer.py | 6 + tests/test_plots.py | 19 +- tests/test_utils.py | 105 +------ 16 files changed, 176 insertions(+), 641 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6a568670a..aaa63ec8d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: files: ^atom/.*\.py$|tests/.*\.py$ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.2 + rev: v0.3.0 hooks: - id: ruff types_or: [ python, pyi, jupyter ] @@ -17,5 +17,5 @@ repos: rev: v1.8.0 hooks: - id: mypy - additional_dependencies: [pip==23.3.1, types-requests, pandas-stubs, beartype] + additional_dependencies: [pip==23.3.1, types-requests, beartype, git+https://github.com/pandas-dev/pandas-stubs.git] files: ^atom/.*\.py$|tests/.*\.py$ diff --git a/atom/basemodel.py b/atom/basemodel.py index 3855912d7..8e2b8e003 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -63,11 +63,11 @@ from atom.utils.constants import DF_ATTRS from atom.utils.patches import fit_and_score from atom.utils.types import ( - HT, Backend, Bool, Engine, FHConstructor, Float, FloatZeroToOneExc, Int, + HT, Backend, Bool, Engine, Float, FloatZeroToOneExc, Int, IntLargerEqualZero, MetricConstructor, MetricFunction, NJobs, Pandas, PredictionMethods, PredictionMethodsTS, Predictor, RowSelector, Scalar, Scorer, Sequence, Stages, TargetSelector, Verbose, Warnings, 
XReturn, - XSelector, YReturn, YSelector, float_t, int_t, + XSelector, YConstructor, YReturn, YSelector, float_t, int_t, ) from atom.utils.utils import ( ClassMap, DataConfig, Goal, PlotCallback, ShapExplanation, Task, @@ -265,13 +265,8 @@ def __init__( # Inject goal-specific methods from ForecastModel if goal is Goal.forecast and ClassRegModel in self.__class__.__bases__: for n, m in vars(ForecastModel).items(): - if not n.startswith("__"): - try: - setattr(self, n, m.__get__(self, ForecastModel)) - except AttributeError: - # available_if descriptor raises an error - # if the estimator doesn't have the method - pass + if hasattr(m, "__get__"): + setattr(self, n, m.__get__(self, ForecastModel)) # Skip this part if only initialized for the estimator if branches: @@ -1937,10 +1932,13 @@ def create_dashboard( self._log("Creating dashboard...", 1) - X, y = self.branch._get_rows(rows, return_X_y=True) + Xt, yt = self.branch._get_rows(rows, return_X_y=True) + + if self.scaler: + Xt = cast(pd.DataFrame, self.scaler.transform(Xt)) # Get shap values from the internal ShapExplanation object - exp = self._shap.get_explanation(X, target=(0,)) + exp = self._shap.get_explanation(Xt, target=(0,)) # Explainerdashboard requires all the target classes if self.task.is_classification: @@ -1954,9 +1952,9 @@ def create_dashboard( params = {"permutation_metric": self._metric, "n_jobs": self.n_jobs} if self.task.is_classification: - explainer = ClassifierExplainer(self.estimator, X, y, **params) + explainer = ClassifierExplainer(self.estimator, Xt, yt, **params) else: - explainer = RegressionExplainer(self.estimator, X, y, **params) + explainer = RegressionExplainer(self.estimator, Xt, yt, **params) explainer.set_shap_values(exp.base_values, exp.values) @@ -2465,8 +2463,7 @@ def transform( ---------- X: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, - `X` is ignored. If None, - `X` is ignored in the transformers. + `X` is ignored. 
If None, `X` is ignored in the transformers. y: int, str, sequence, dataframe-like or None, default=None Target column(s) corresponding to `X`. @@ -2639,10 +2636,7 @@ def get_transform_X_y( with adjust(self.pipeline, verbose=verbose) as pl: out = pl.transform(Xt, yt) - if isinstance(out, tuple): - return out - else: - return out, yt + return out if isinstance(out, tuple) else (out, yt) def assign_prediction_columns() -> list[str]: """Assign column names for the prediction methods. @@ -2971,7 +2965,7 @@ def get_tags(self) -> dict[str, Any]: @overload def _prediction( self, - fh: RowSelector | FHConstructor | None = ..., + fh: RowSelector | ForecastingHorizon | None = ..., y: RowSelector | YSelector | None = ..., X: XSelector | None = ..., metric: str | MetricFunction | Scorer | None = ..., @@ -2990,7 +2984,7 @@ def _prediction( @overload def _prediction( self, - fh: RowSelector | FHConstructor | None, + fh: RowSelector | ForecastingHorizon | None, y: RowSelector | YSelector | None, X: XSelector | None, metric: str | MetricFunction | Scorer | None, @@ -3001,7 +2995,7 @@ def _prediction( def _prediction( self, - fh: RowSelector | FHConstructor | None = None, + fh: RowSelector | ForecastingHorizon | None = None, y: RowSelector | YSelector | None = None, X: XSelector | None = None, metric: str | MetricFunction | Scorer | None = None, @@ -3011,9 +3005,10 @@ def _prediction( ) -> Float | Normal | Pandas: """Get predictions on new data or existing rows. - New data is first transformed through the model's pipeline. - Transformers that are only applied on the training set are - skipped. The model should implement the provided method. + If `fh` is not a [ForecastingHorizon][], it gets the rows from + the branch. If `fh` is a [ForecastingHorizon][] or not provided, + it converts `X` and `y` through the pipeline. The model should + implement the provided method. Parameters ---------- @@ -3050,61 +3045,45 @@ def _prediction( called. 
""" + Xt: pd.DataFrame | None + yt: Pandas | None - def get_transform_X_y( - X: XSelector | None, - y: YSelector | None, - ) -> tuple[pd.DataFrame, Pandas | None]: - """Get X and y from the pipeline transformation. - - Parameters - ---------- - X: dataframe-like or None - Feature set. + if not isinstance(fh, ForecastingHorizon | None): + Xt, yt = self.branch._get_rows(fh, return_X_y=True) - y: int, str, sequence, dataframe-like or None - Target column(s) corresponding to `X`. - - Returns - ------- - dataframe - Transformed feature set. - - series, dataframe or None - Transformed target column. - - """ - Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) - - with adjust(self.pipeline, verbose=verbose) as pl: - out = pl.transform(Xt, yt) + if self.scaler: + Xt = cast(pd.DataFrame, self.scaler.transform(Xt)) - if isinstance(out, tuple): - return out - elif X is not None: - return out, yt - else: - return Xt, out + fh = ForecastingHorizon(Xt.index, is_relative=False) - if y is not None: + elif y is not None: try: Xt, yt = self.branch._get_rows(y, return_X_y=True) # type: ignore[call-overload] - if self.scaler and not Xt.empty: + if self.scaler and Xt is not None: Xt = cast(pd.DataFrame, self.scaler.transform(Xt)) except Exception: # noqa: BLE001 - Xt, yt = get_transform_X_y(X, y) + Xt, yt = self._check_input(X, y, columns=self.og.features, name=self.og.target) # type: ignore[arg-type] + + with adjust(self.pipeline, verbose=verbose) as pl: + out = pl.transform(Xt, yt) + + Xt, yt = out if isinstance(out, tuple) else (Xt, out) + + elif X is not None: + Xt, _ = self._check_input(X, columns=self.og.features, name=self.og.target) # type: ignore[call-overload, arg-type] + + with adjust(self.pipeline, verbose=verbose) as pl: + Xt = pl.transform(Xt) else: - Xt, yt = get_transform_X_y(X, y) + Xt, yt = X, y if method != "score": if "y" in sign(func := getattr(self.estimator, method)): return self.memory.cache(func)(y=yt, X=check_empty(Xt), **kwargs) 
else: - if fh is not None and not isinstance(fh, ForecastingHorizon): - fh = self.branch._get_rows(fh).index return self.memory.cache(func)(fh=fh, X=check_empty(Xt), **kwargs) else: if metric is None: @@ -3112,13 +3091,12 @@ def get_transform_X_y( else: scorer = get_custom_scorer(metric) - return self._score_from_est(scorer, self.estimator, Xt, yt, **kwargs) + return self._score_from_est(scorer, self.estimator, Xt, yt, **kwargs) # type: ignore[arg-type] - @available_if(estimator_has_attr("predict")) @composed(crash, method_to_log, beartype) def predict( self, - fh: RowSelector | FHConstructor, + fh: RowSelector | ForecastingHorizon, X: XSelector | None = None, *, inverse: Bool = True, @@ -3165,11 +3143,10 @@ def predict( else: return self._convert(pred) - @available_if(estimator_has_attr("predict_interval")) @composed(crash, method_to_log, beartype) def predict_interval( self, - fh: RowSelector | FHConstructor, + fh: RowSelector | ForecastingHorizon, X: XSelector | None = None, *, coverage: Float | Sequence[Float] = 0.9, @@ -3242,11 +3219,10 @@ def predict_interval( else: return self._convert(pred) - @available_if(estimator_has_attr("predict_proba")) @composed(crash, method_to_log, beartype) def predict_proba( self, - fh: RowSelector | FHConstructor, + fh: RowSelector | ForecastingHorizon, X: XSelector | None = None, *, marginal: Bool = True, @@ -3290,11 +3266,10 @@ def predict_proba( method="predict_proba", ) - @available_if(estimator_has_attr("predict_quantiles")) @composed(crash, method_to_log, beartype) def predict_quantiles( self, - fh: RowSelector | FHConstructor, + fh: RowSelector | ForecastingHorizon, X: XSelector | None = None, *, alpha: Float | Sequence[Float] = (0.05, 0.95), @@ -3341,11 +3316,10 @@ def predict_quantiles( ) ) - @available_if(estimator_has_attr("predict_residuals")) @composed(crash, method_to_log, beartype) def predict_residuals( self, - y: RowSelector | YSelector, + y: RowSelector | YConstructor, X: XSelector | None = None, *, verbose: 
Verbose | None = None, @@ -3360,11 +3334,13 @@ def predict_residuals( Parameters ---------- - y: int, str, sequence or dataframe-like - Ground truth observations. + y: hashable, segment, sequence or dataframe-like + [Selection of rows][row-and-column-selection] or ground + truth observations. - X: hashable, segment, sequence, dataframe-like or None, default=None - Exogenous time series corresponding to `y`. + X: dataframe-like or None, default=None + Exogenous time series corresponding to `y`. This parameter + is ignored if `y` is a selection of rows in the dataset. verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If @@ -3381,11 +3357,10 @@ def predict_residuals( self._prediction(y=y, X=X, verbose=verbose, method="predict_residuals") ) - @available_if(estimator_has_attr("predict_var")) @composed(crash, method_to_log, beartype) def predict_var( self, - fh: RowSelector | FHConstructor, + fh: RowSelector | ForecastingHorizon, X: XSelector | None = None, *, cov: Bool = False, @@ -3432,13 +3407,12 @@ def predict_var( ) ) - @available_if(estimator_has_attr("score")) @composed(crash, method_to_log, beartype) def score( self, y: RowSelector | YSelector, X: XSelector | None = None, - fh: RowSelector | FHConstructor | None = None, + fh: RowSelector | ForecastingHorizon | None = None, *, metric: str | MetricFunction | Scorer | None = None, verbose: Verbose | None = None, @@ -3453,20 +3427,22 @@ def score( !!! info If the `metric` parameter is left to its default value, the - method returns atom's metric score, not the metric returned - by sktime's score method for estimators. + method returns atom's metric score, not the metric used by + sktime's score method for estimators. Parameters ---------- y: int, str, sequence or dataframe-like - Ground truth observations. + [Selection of rows][row-and-column-selection] or ground + truth observations. 
- X: hashable, segment, sequence, dataframe-like or None, default=None - Exogenous time series corresponding to `fh`. + X: dataframe-like or None, default=None + Exogenous time series corresponding to `fh`. This parameter + is ignored if `y` is a selection of rows in the dataset. fh: hashable, segment, sequence, dataframe, [ForecastingHorizon][] or None, default=None - The [forecasting horizon][row-and-column-selection] encoding - the time stamps to forecast at. + Do nothing. The forecast horizon is taken from the index of + `y`. Implemented for continuity of sktime's API. metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, @@ -3481,14 +3457,7 @@ def score( Returns ------- float - Metric score of y with respect to a ground truth. + Metric score of `y` with respect to a ground truth. """ - return self._prediction( - y=y, - X=X, - fh=fh, - metric=metric, - verbose=verbose, - method="score", - ) + return self._prediction(fh=None, y=y, X=X, metric=metric, verbose=verbose, method="score") diff --git a/atom/models/custom.py b/atom/models/custom.py index d013de331..8a25aadcf 100644 --- a/atom/models/custom.py +++ b/atom/models/custom.py @@ -33,7 +33,7 @@ def create_custom_model(estimator: Predictor, **kwargs) -> BaseModel: """ base = ForecastModel if kwargs["goal"] is Goal.forecast else ClassRegModel - class CustomModel(base): # type: ignore[valid-type, misc] + class CustomModel(base): # type: ignore[valid-type] """Model with estimator provided by user.""" def __init__(self, **kwargs): diff --git a/atom/models/ensembles.py b/atom/models/ensembles.py index b3de2ae07..37f0bb6ca 100644 --- a/atom/models/ensembles.py +++ b/atom/models/ensembles.py @@ -32,7 +32,7 @@ def create_stacking_model(**kwargs) -> BaseModel: """ base = ForecastModel if kwargs["goal"] is Goal.forecast else ClassRegModel - class Stacking(base): # type: ignore[valid-type, misc] + class Stacking(base): # type: ignore[valid-type] """Stacking ensemble. 
Parameters @@ -57,7 +57,7 @@ class Stacking(base): # type: ignore[valid-type, misc] _estimators: ClassVar[dict[str, str]] = { "classification": "sklearn.ensemble.StackingClassifier", "regression": "sklearn.ensemble.StackingRegressor", - "forecast": "atom.utils.patches.StackingForecaster", + "forecast": "sktime.forecasting.compose.StackingForecaster", } def __init__(self, models: list[Model], **kwargs): @@ -120,7 +120,7 @@ def create_voting_model(**kwargs) -> BaseModel: """ base = ForecastModel if kwargs["goal"] is Goal.forecast else ClassRegModel - class Voting(base): # type: ignore[valid-type, misc] + class Voting(base): # type: ignore[valid-type] """Voting ensemble. Parameters @@ -143,9 +143,9 @@ class Voting(base): # type: ignore[valid-type, misc] supports_engines = ("sklearn",) _estimators: ClassVar[dict[str, str]] = { - "classification": "atom.utils.patches.VotingClassifier", - "regression": "atom.utils.patches.VotingRegressor", - "forecast": "atom.utils.patches.EnsembleForecaster", + "classification": "sklearn.ensemble.VotingClassifier", + "regression": "sklearn.ensemble.VotingRegressor", + "forecast": "sktime.forecasting.compose.EnsembleForecaster", } def __init__(self, models: list[Model], **kwargs): diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py index eebd6d475..7f407970a 100644 --- a/atom/plots/predictionplot.py +++ b/atom/plots/predictionplot.py @@ -42,8 +42,8 @@ TargetsSelector, XConstructor, int_t, ) from atom.utils.utils import ( - Task, check_canvas, check_dependency, check_empty, check_predict_proba, - crash, divide, get_custom_scorer, has_task, lst, rnd, + Task, adjust, check_canvas, check_dependency, check_predict_proba, crash, + divide, get_custom_scorer, has_task, lst, rnd, ) @@ -1230,23 +1230,14 @@ def plot_forecast( models_c = self._get_plot_models(models) target_c = self.branch._get_target(target, only_columns=True) - if not isinstance(fh, ForecastingHorizon): - fh = self.branch._get_rows(fh).index - fig = 
self._get_figure() xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0)) xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29)) for m in models_c: - if X is not None: - Xt = m.transform(X) - elif isinstance(fh, pd.Index): - Xt = m.branch._all.loc[fh] - else: - Xt = X - # Draw predictions and interval - y_pred = m.predict(fh=fh, X=check_empty(Xt), inverse=inverse) + with adjust(m, transform="pandas"): + y_pred = m.predict(fh=fh, X=X, inverse=inverse) if self.task.is_multioutput: y_pred = y_pred[target_c] @@ -1255,10 +1246,16 @@ def plot_forecast( idx = y_pred.index.intersection(m.branch.train.index) y_pred.loc[idx] = np.nan # type: ignore[call-overload] + if y_pred.isna().all(): + raise ValueError( + "Invalid value for the plot_insample parameter. plot_insample " + "must be True when all predicted values are in the training set." + ) + if inverse: - y_true = m.og._all.loc[y_pred.index, target_c] + y_true = m.og._all[target_c] else: - y_true = m.branch._all.loc[y_pred.index, target_c] + y_true = m.branch._all[target_c] self._draw_line( x=(x := self._get_plot_index(y_pred)), @@ -1270,21 +1267,23 @@ def plot_forecast( yaxis=yaxis, ) - # Draw residuals - self._draw_line( - x=x, - y=np.subtract(y_true, y_pred), - mode="lines+markers", - parent=m.name, - legend=legend, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis2, - ) + # Draw residuals if fh within range of y_true + if not (idx := y_pred.index.intersection(y_true.index)).empty: + self._draw_line( + x=x, + y=np.subtract(y_true.loc[idx], y_pred.loc[idx]), + mode="lines+markers", + parent=m.name, + legend=legend, + showlegend=False, + xaxis=xaxis2, + yaxis=yaxis2, + ) if plot_interval: try: - y_interval = m.predict_interval(fh=fh, X=Xt, inverse=inverse) + with adjust(m, transform="pandas"): + y_interval = m.predict_interval(fh=fh, X=X, inverse=inverse) except (AttributeError, NotImplementedError): continue # Fails for some models like ES @@ -1328,7 +1327,7 @@ def plot_forecast( # Draw original time series 
fig.add_scatter( - x=x, + x=self._get_plot_index(y_true), y=y_true, name=target_c, mode="lines+markers", @@ -2941,7 +2940,9 @@ def plot_probabilities( for m in models_c: X, y_true = m.branch._get_rows(rows, return_X_y=True) - y_pred = m.predict_proba(X.index) + + with adjust(m, transform="pandas"): + y_pred = m.predict_proba(X.index) for v in np.unique(m.dataset[col]): # Get indices per class diff --git a/atom/utils/patches.py b/atom/utils/patches.py index bc7b2bc2b..c1062e7c1 100644 --- a/atom/utils/patches.py +++ b/atom/utils/patches.py @@ -8,32 +8,14 @@ from __future__ import annotations from collections.abc import Callable -from copy import deepcopy from typing import Any from unittest.mock import patch -import numpy as np -import pandas as pd -from joblib import Parallel, delayed -from sklearn.base import clone -from sklearn.ensemble import VotingClassifier as VC -from sklearn.ensemble import VotingRegressor as VR -from sklearn.ensemble._base import _fit_single_estimator from sklearn.model_selection._validation import _fit_and_score, _score -from sklearn.utils import Bunch -from sklearn.utils.multiclass import check_classification_targets -from sktime.forecasting.compose import EnsembleForecaster as EF -from sktime.forecasting.compose import StackingForecaster as SF -from typing_extensions import Self -from atom.utils.types import ( - Bool, Float, Int, Predictor, Scalar, Sequence, XSelector, -) -from atom.utils.utils import check_is_fitted +from atom.utils.types import Float -# Functions ======================================================== >> - def fit_and_score(*args, **kwargs) -> dict[str, Any]: """Wrap sklearn's _fit_and_score function. 
@@ -67,310 +49,3 @@ def wrapper(*args, **kwargs) -> Float | dict[str, Float]: return f(args_c[0][-1], *args_c[1:], **kwargs) return wrapper - - -# Ensembles ======================================================== >> - -class BaseVoting: - """Base class for the patched voting estimators.""" - - def _get_fitted_attrs(self): - """Update the fit attributes (end with underscore).""" - self.named_estimators_ = Bunch() - - # Uses 'drop' as placeholder for dropped estimators - est_iter = iter(self.estimators_) - for name, est in self.estimators: - if est == "drop" or check_is_fitted(est, exception=False): - self.named_estimators_[name] = est - else: - self.named_estimators_[name] = next(est_iter) - - if hasattr(est, "feature_names_in_"): - self.feature_names_in_ = est.feature_names_in_ - - def fit( - self, - X: XSelector, - y: Sequence[Any], - sample_weight: Sequence[Scalar] | None = None, - ) -> Self: - """Fit the estimators in the ensemble. - - Largely same code as sklearn's implementation with one major - difference: estimators that are already fitted are skipped. - - Parameters - ---------- - X: dataframe-like - Feature set with shape (n_samples, n_features) - - y: sequence - Target column. - - sample_weight: sequence or None, default=None - Sample weights. If None, then samples are equally weighted. - Note that this is supported only if all underlying estimators - support sample weights. - - Returns - ------- - self - Estimator instance. 
- - """ - names, all_estimators = self._validate_estimators() - - # Difference with sklearn's implementation, skip fitted estimators - estimators = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_single_estimator)( - clone(clf), - X, - y, - sample_weight=sample_weight, - message_clsname="Voting", - message=self._log_message(names[idx], idx + 1, len(all_estimators)), - ) - for idx, clf in enumerate(all_estimators) - if clf != "drop" and not check_is_fitted(clf, exception=False) - ) - - self.estimators_ = [] - estimators = iter(estimators) - for est in self.estimators: - if est[1] != "drop": - if check_is_fitted(est[1], exception=False): - self.estimators_.append(est[1]) - else: - self.estimators_.append(next(estimators)) - - self._get_fitted_attrs() # Update the fit attrs - - return self - - -class VotingClassifier(BaseVoting, VC): - """Soft Voting/Majority Rule classifier. - - Modified version of sklearn's VotingClassifier. The differences - are: - - - Doesn't fit estimators if they're already fitted. - - Is considered fitted when all estimators are. - - Doesn't implement a LabelEncoder to encode the target column. - - See sklearn's [VotingClassifier][] for a description of the - parameters and attributes. 
- - """ - - __module__ = VC.__module__ - __name__ = VC.__name__ - __qualname__ = VC.__qualname__ - __doc__ = VC.__doc__ - __annotations__ = VC.__annotations__ - - def __init__( - self, - estimators: list[tuple[str, Predictor]], - *, - voting: str = "hard", - weights: Sequence[Scalar] | None = None, - n_jobs: Int | None = None, - flatten_transform: Bool = True, - verbose: Bool = False, - ): - super().__init__( - estimators=estimators, - voting=voting, - weights=weights, - n_jobs=n_jobs, - flatten_transform=flatten_transform, - verbose=verbose, - ) - - # If all estimators are prefit, create fitted attrs - if all( - est[1] == "drop" or check_is_fitted(est[1], exception=False) - for est in self.estimators - ): - self.estimators_ = [est[1] for est in self.estimators if est[1] != "drop"] - self._get_fitted_attrs() - - def fit( - self, - X: XSelector, - y: Sequence[Any], - sample_weight: Sequence[Scalar] | None = None, - ) -> Self: - """Fit the estimators, skipping prefit ones. - - Parameters - ---------- - X: dataframe-like - Feature set with shape=(n_samples, n_features). - - y: sequence - Target column. - - sample_weight: sequence or None, default=None - Sample weights. If None, then samples are equally weighted. - Note that this is supported only if all underlying estimators - support sample weights. - - Returns - ------- - self - Estimator instance. - - """ - check_classification_targets(y) - if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: - raise NotImplementedError( - "Multilabel and multioutput classification is not supported." - ) - - if self.voting not in ("soft", "hard"): - raise ValueError(f"Voting must be 'soft' or 'hard', got (voting={self.voting}).") - - if self.weights is not None and len(self.weights) != len(self.estimators): - raise ValueError( - "Number of estimators and weights must be equal, got " - f"{len(self.weights)} weights, {len(self.estimators)} estimators." 
- ) - - return super().fit(X, y, sample_weight) - - def predict(self, X: XSelector) -> np.ndarray: - """Predict class labels for X. - - Parameters - ---------- - X: dataframe-like - Feature set with shape=(n_samples, n_features). - - Returns - ------- - np.array - Predicted class labels. - - """ - check_is_fitted(self) - if self.voting == "soft": - return np.argmax(self.predict_proba(X), axis=1) - else: - return np.apply_along_axis( - lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)), - axis=1, - arr=self._predict(X), - ) - - -class VotingRegressor(BaseVoting, VR): - """Soft Voting/Majority Rule regressor. - - Modified version of sklearn's VotingRegressor. Differences are: - - - Doesn't fit estimators if they're already fitted. - - Is considered fitted when all estimators are. - - See sklearn's [VotingRegressor][] for a description of the - parameters and attributes. - - """ - - __module__ = VR.__module__ - __name__ = VR.__name__ - __qualname__ = VR.__qualname__ - __doc__ = VR.__doc__ - __annotations__ = VR.__annotations__ - - def __init__( - self, - estimators: list[tuple[str, Predictor]], - *, - weights: Sequence[Scalar] | None = None, - n_jobs: Int | None = None, - verbose: Bool = False, - ): - super().__init__( - estimators, - weights=weights, - n_jobs=n_jobs, - verbose=verbose, - ) - - # If all estimators are prefit, create fitted attrs - if all( - est[1] == "drop" or check_is_fitted(est[1], exception=False) for est in self.estimators - ): - self.estimators_ = [est[1] for est in self.estimators if est[1] != "drop"] - self._get_fitted_attrs() - - -class BaseForecaster: - """Base class for the patched ensemble forecasters.""" - - def _fit_forecasters(self, forecasters, y, X, fh): - """Fit all forecasters in parallel. - - Patched to skip already fitted forecasters from refitting. 
- - """ - if all(check_is_fitted(fc, exception=False) for fc in forecasters): - self.forecasters_ = [deepcopy(fc) for fc in forecasters] - else: - self.forecasters_ = Parallel(n_jobs=self.n_jobs)( - delayed(lambda fc, y, X, fh: fc.fit(y, X, fh))(fc.clone(), y, X, fh) - for fc in forecasters - ) - - def _predict_forecasters(self, fh=None, X=None): - """Collect results from forecaster.predict() calls. - - Patched to convert all prediction to pd.DataFrame, which is - done normally during fit(). If absent, the prediction fails - when trying to get a multilevel index. - - """ - return [pd.DataFrame(forecaster.predict(fh=fh, X=X)) for forecaster in self.forecasters_] - - -class EnsembleForecaster(BaseForecaster, EF): - """Ensemble of voting forecasters. - - Modified version of sktime's EnsembleForecaster. The differences - are: - - - Doesn't fit estimators if they're already fitted. - - See sktime's [EnsembleForecaster][] for a description of the - parameters and attributes. - - """ - - __module__ = EF.__module__ - __name__ = EF.__name__ - __qualname__ = EF.__qualname__ - __doc__ = EF.__doc__ - __annotations__ = EF.__annotations__ - - -class StackingForecaster(BaseForecaster, SF): - """Ensemble of stacking forecasters. - - Modified version of sktime's StackingForecaster. The differences - are: - - - Doesn't fit estimators if they're already fitted. - - See sktime's [StackingForecaster][] for a description of the - parameters and attributes. - - """ - - __module__ = SF.__module__ - __name__ = SF.__name__ - __qualname__ = SF.__qualname__ - __doc__ = SF.__doc__ - __annotations__ = SF.__annotations__ diff --git a/atom/utils/types.py b/atom/utils/types.py index d6f0ff271..771275030 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -267,7 +267,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... 
XSelector: TypeAlias = XConstructor | Callable[..., XConstructor] YConstructor: TypeAlias = Sequence[Any] | XConstructor YSelector: TypeAlias = Int | str | YConstructor -FHConstructor: TypeAlias = Int | Sequence[Int] | ForecastingHorizon +FHConstructor: TypeAlias = Hashable | Sequence[Hashable] | ForecastingHorizon # Selection of rows or columns by name or position ColumnSelector: TypeAlias = Int | str | Segment | Sequence[Int | str] | pd.DataFrame diff --git a/atom/utils/utils.py b/atom/utils/utils.py index ae47ab58a..ff2ea865a 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -33,6 +33,7 @@ from beartype.door import is_bearable from IPython.display import display from matplotlib.colors import to_rgba +from mlflow.models.signature import infer_signature from pandas._libs.missing import NAType from pandas._typing import Axes, Dtype from pandas.api.types import is_numeric_dtype @@ -44,7 +45,7 @@ matthews_corrcoef, ) from sklearn.utils import _print_elapsed_time -from sklearn.utils.validation import _is_fitted +from sklearn.utils.validation import _check_response_method, _is_fitted from atom.utils.constants import CAT_TYPES, __version__ from atom.utils.types import ( @@ -654,25 +655,24 @@ def __call__(self, study: Study, trial: FrozenTrial): for met in self.T._metric.keys(): mlflow.log_metric(f"{met}_validation", trial_info[met]) - if estimator := trial_info["estimator"]: - # Mlflow only accepts params with char length <=250 - mlflow.log_params( - {k: v for k, v in estimator.get_params().items() if len(str(v)) <= 250} - ) + # Mlflow only accepts params with char length <=250 + mlflow.log_params( + { + k: v + for k, v in trial_info["estimator"].get_params().items() + if len(str(v)) <= 250 + } + ) - mlflow.sklearn.log_model( - sk_model=estimator, - artifact_path=estimator.__class__.__name__, - signature=mlflow.models.signature.infer_signature( - model_input=pd.DataFrame(self.T.branch.X), - model_output=estimator.predict(self.T.branch.X.iloc[[0]]), - ), - 
input_example=pd.DataFrame(self.T.branch.X.iloc[[0], :]), - ) - else: - mlflow.log_params( - {k: v for k, v in trial.params.items() if len(str(v)) <= 250} - ) + mlflow.sklearn.log_model( + sk_model=(est := trial_info["estimator"]), + artifact_path=est.__class__.__name__, + signature=infer_signature( + model_input=self.T.branch.X, + model_output=est.predict(self.T.branch.X.iloc[[0]]), + ), + input_example=self.T.branch.X.iloc[[0], :], + ) if self.n_jobs == 1: # Print overview of trials @@ -898,23 +898,6 @@ def __init__( self._shap_values = pd.Series(dtype="object") self._interaction_values = pd.Series(dtype="object") - @property - def attr(self) -> str: - """Get the model's main prediction method. - - Returns - ------- - str - Name of the prediction method. - - """ - if hasattr(self.estimator, "predict_proba"): - return "predict_proba" - elif hasattr(self.estimator, "decision_function"): - return "decision_function" - else: - return "predict" - @cached_property def explainer(self) -> Explainer: """Get shap's explainer. 
@@ -935,7 +918,8 @@ def explainer(self) -> Explainer: return Explainer(self.estimator, **kwargs) except TypeError: # If a method is provided as first arg, selects always Permutation - return Explainer(getattr(self.estimator, self.attr), **kwargs) + responses = ("predict_proba", "decision_function", "predict") + return Explainer(_check_response_method(self.estimator, responses), **kwargs) def get_explanation( self, @@ -1852,7 +1836,7 @@ def to_df( data_c = pd.DataFrame.sparse.from_spmatrix(data, index, columns) else: data_c = pd.DataFrame( - data=data, # type: ignore[misc, arg-type] + data=data, # type: ignore[arg-type] index=index, columns=columns, copy=True, @@ -1995,12 +1979,12 @@ def to_tabular( """ if (n_targets := n_cols(data)) == 1: - return to_series(data, index=index, name=flt(columns)) # type: ignore[misc, arg-type] + return to_series(data, index=index, name=flt(columns)) # type: ignore[arg-type] else: - if columns is None and not hasattr(data, "__dataframe__"): + if columns is None and not isinstance(data, dict) and not hasattr(data, "__dataframe__"): columns = [f"y{i}" for i in range(n_targets)] - return to_df(data, index=index, columns=columns) # type: ignore[misc, arg-type] + return to_df(data, index=index, columns=columns) # type: ignore[arg-type] def check_is_fitted( @@ -2013,8 +1997,9 @@ def check_is_fitted( Checks if the estimator is fitted by verifying the presence of fitted attributes (not None or empty). Otherwise, it raises a - NotFittedError. Extension on sklearn's function that accounts - for empty dataframes and series and returns a boolean. + NotFittedError. Wraps sklearn's function but doesn't check for + the presence of the `fit` method and can return a boolean instead + of always raising an exception. Parameters ---------- @@ -2036,12 +2021,7 @@ def check_is_fitted( Whether the estimator is fitted. 
""" - if hasattr(obj, "_is_fitted"): - is_fitted = obj._is_fitted - else: - is_fitted = _is_fitted(obj, attributes) - - if not is_fitted: + if not _is_fitted(obj, attributes): if exception: raise NotFittedError( f"This {type(obj).__name__} instance is not yet fitted. " @@ -2359,8 +2339,6 @@ def fit_one( if len(kwargs) == 0: if y is not None and hasattr(estimator, "_cols"): kwargs["X"] = to_df(y)[inc] - elif params["X"].default != Parameter.empty: - kwargs["X"] = params["X"].default # Fill X with default elif X is None: raise ValueError( "Exception while trying to fit transformer " @@ -2616,7 +2594,7 @@ def estimator_has_attr(attr: str) -> Callable: def check(model: BaseModel) -> bool: # Raise original `AttributeError` if `attr` does not exist - getattr(model._est_class, attr) + getattr(model.estimator, attr) return True return check diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index 216bfd6ab..b6d6c85db 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -90,7 +90,7 @@ running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/l * **[mypy](https://www.mypy-lang.org/)** (>=1.8.0) * **[pandas_stubs](https://pypi.org/project/pandas-stubs/)** (>=2.2.0.240218) * **[pre-commit](https://pre-commit.com/)** (>=3.5.0) -* **[ruff](https://docs.astral.sh/ruff/)** (>=0.2.2) +* **[ruff](https://docs.astral.sh/ruff/)** (>=0.3.0) **Testing** diff --git a/docs_sources/user_guide/data_management.md b/docs_sources/user_guide/data_management.md index 590d881bf..422ff31a1 100644 --- a/docs_sources/user_guide/data_management.md +++ b/docs_sources/user_guide/data_management.md @@ -408,8 +408,7 @@ as the index of the row selection. Note that, contrary to sktime's API but for consistency with the rest of ATOM's API, atom's fh starts with the training set, i.e., selecting `#!python atom.nf.predict(fh=range(5))` forecasts the first 5 rows of the training set, not the test set. 
To get the same result as sktime, use -`#!python atom.nf.predict(fh=range(len(atom.test), len(atom.test) + 5))` or -`#!python atom.nf.predict(fh=atom.test.index[:5])` instead. +a [ForecastingHorizon][] object, e.g., `#!python atom.nf.predict(fh=ForecastingHorizon(range(5)))`. !!! info diff --git a/pyproject.toml b/pyproject.toml index da54a8729..ee540f26b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ dev = [ "mypy>=1.8.0", "pandas_stubs>=2.2.0.240218", "pre-commit>=3.5.0", - "ruff>=0.2.2", + "ruff>=0.3.0", # Testing "nbmake>=1.4.1", # To test example notebooks "pytest>=7.2.1", @@ -196,4 +196,10 @@ disable_error_code = [ "attr-defined", "abstract", # See https://github.com/python/mypy/issues/4717 "override", # Transformers' methods don't always match with that of TransformerMixin + "misc", +] + +[tool.coverage.report] +exclude_also = [ + "IF TYPE_CHECKING", ] diff --git a/tests/test_atom.py b/tests/test_atom.py index e6c8a7a99..0a546040b 100644 --- a/tests/test_atom.py +++ b/tests/test_atom.py @@ -85,7 +85,7 @@ def test_backend_with_n_jobs_1(): def test_init(): """Assert that the __init__ method works for non-standard parameters.""" - atom = ATOMClassifier(X_bin, y_bin, device="gpu", backend="multiprocessing") + atom = ATOMClassifier(X_bin, y_bin, n_jobs=2, device="gpu", backend="multiprocessing") assert atom.device == "gpu" assert atom.backend == "multiprocessing" diff --git a/tests/test_basemodel.py b/tests/test_basemodel.py index cd1be936b..c615b8844 100644 --- a/tests/test_basemodel.py +++ b/tests/test_basemodel.py @@ -702,7 +702,7 @@ def test_create_app(interface): def test_create_dashboard_multioutput(): """Assert that the method is unavailable for multioutput tasks.""" atom = ATOMClassifier(X_class, y=y_multiclass, random_state=1) - atom.run("Tree") + atom.run("LR") with pytest.raises(AttributeError, match=".*has no attribute.*"): atom.tree.create_dashboard() @@ -1036,18 +1036,19 @@ def test_forecast_get_tags(): def 
test_predictions_only_fh(): """Assert that predictions can be made using only the fh.""" atom = ATOMForecaster(y_fc, random_state=1) - atom.run("NF") + atom.run(["NF", "OLS"]) assert isinstance(atom.nf.predict(fh=range(10)), pd.Series) - assert isinstance(atom.nf.predict_interval(fh=ForecastingHorizon([1, 2])), pd.DataFrame) + assert isinstance(atom.ols.predict(fh=ForecastingHorizon([1, 2])), pd.Series) def test_predictions_with_exogenous(): """Assert that predictions can be made with exogenous variables.""" atom = ATOMForecaster(X_ex, y=y_ex, random_state=1) atom.run("NF") - assert isinstance(atom.nf.predict_proba(fh=range(10), X=X_ex.iloc[:10]), Normal) - assert isinstance(atom.nf.predict_quantiles(fh=range(10), X=X_ex.iloc[:10]), pd.DataFrame) - assert isinstance(atom.nf.predict_var(fh=range(10), X=X_ex.iloc[:10]), pd.DataFrame) + assert isinstance(atom.nf.predict(ForecastingHorizon(range(10)), X=X_ex.iloc[:10]), pd.Series) + assert isinstance(atom.nf.predict_proba(range(10), X=X_ex.iloc[:10]), Normal) + assert isinstance(atom.nf.predict_quantiles(range(10), X=X_ex.iloc[:10]), pd.DataFrame) + assert isinstance(atom.nf.predict_var(range(10), X=X_ex.iloc[:10]), pd.DataFrame) def test_ts_prediction_inverse_transform(): @@ -1063,9 +1064,9 @@ def test_ts_prediction_inverse_transform(): def test_predictions_with_y(): """Assert that predictions can be made with y.""" - atom = ATOMForecaster(X_ex, y=y_ex, random_state=1) - atom.run("NF") - assert isinstance(atom.nf.predict_residuals(y=y_ex[:10], X=X_ex.iloc[:10]), pd.Series) + atom = ATOMForecaster(y_fc[:-10], random_state=1) + atom.run("OLS") + assert isinstance(atom.ols.predict_residuals(y=y_fc[-10:]), pd.Series) def test_score_ts_metric_is_None(): diff --git a/tests/test_basetransformer.py b/tests/test_basetransformer.py index dee3334db..e60b69664 100644 --- a/tests/test_basetransformer.py +++ b/tests/test_basetransformer.py @@ -329,6 +329,12 @@ def test_target_is_int(): assert y.name == "mean radius" +def 
test_target_is_dict(): + """Assert that target column is assigned correctly for a dictionary.""" + _, y = BaseTransformer._check_input(X10, {"y1": y10, "y2": y10}) + assert list(y.columns) == ["y1", "y2"] + + def test_X_is_None_with_int(): """Assert that an error is raised when X is None and y is an int.""" with pytest.raises(ValueError, match=".*can't be None when y is an int.*"): diff --git a/tests/test_plots.py b/tests/test_plots.py index d96f2ef31..c06199851 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -15,7 +15,6 @@ from optuna.visualization._terminator_improvement import _ImprovementInfo from shap.plots._force import AdditiveForceVisualizer from sklearn.metrics import f1_score, get_scorer, mean_squared_error -from sktime.forecasting.base import ForecastingHorizon from atom import ATOMClassifier, ATOMForecaster, ATOMRegressor from atom.plots.baseplot import Aesthetics, BaseFigure @@ -566,7 +565,7 @@ def test_plot_confusion_matrix(): # For binary classification tasks atom = ATOMClassifier(X_bin, y_bin, random_state=1) atom.run(["RF", "LGB"], est_params={"n_estimators": 5}) - atom.plot_confusion_matrix(threshold=0.2, display=False) + atom.plot_confusion_matrix(rows=[0, 1, 2], threshold=0.2, display=False) # For multiclass classification tasks atom = ATOMClassifier(X_class, y_class, random_state=1) @@ -627,14 +626,18 @@ def test_plot_feature_importance(): def test_plot_forecast(): """Assert that the plot_forecast method works.""" - atom = ATOMForecaster(X_ex, y=(-2, -1), holdout_size=0.1, random_state=1) - atom.run(models=["NF", "ES"]) - atom.plot_forecast(inverse=False, display=False) - atom.plot_forecast(fh=atom.holdout.index, X=atom.holdout, display=False) - atom = ATOMForecaster(y_fc, random_state=1) atom.run(models="NF") - atom.plot_forecast(fh=ForecastingHorizon(range(3)), display=False) + + # All values are in train set when in_sample=False + with pytest.raises(ValueError, match=".*plot_insample parameter.*"): + 
atom.plot_forecast(fh=range(3), plot_insample=False, display=False) + + atom.plot_forecast(inverse=False, display=False) + + atom = ATOMForecaster(X_ex, y=(-2, -1), random_state=1) + atom.run(models=["NF", "ES"]) + atom.plot_forecast(display=False) def test_plot_gains(): diff --git a/tests/test_utils.py b/tests/test_utils.py index 12f107676..893dfbe7f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,55 +8,10 @@ from datetime import timedelta from unittest.mock import patch -import numpy as np import pytest -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor -from sklearn.linear_model import LinearRegression -from sklearn.preprocessing import StandardScaler from atom import show_versions -from atom.pipeline import Pipeline -from atom.utils.patches import VotingClassifier, VotingRegressor -from atom.utils.utils import ClassMap, check_is_fitted, time_to_str - -from .conftest import X_bin, X_reg, y_bin, y_reg - - -@pytest.fixture() -def classifiers(): - """Get a list of classifiers for the ensemble.""" - return [ - ("lda", LinearDiscriminantAnalysis().fit(X_bin, y_bin)), - ("placeholder1", "drop"), - ( - "pl", - Pipeline( - [ - ("scaler", StandardScaler()), - ("et", ExtraTreesClassifier(n_estimators=5)), - ], - ).fit(X_bin, y_bin), - ), - ] - - -@pytest.fixture() -def regressors(): - """Get a list of regressors for the ensemble.""" - return [ - ("ols", LinearRegression()), - ("placeholder1", "drop"), - ( - "pl", - Pipeline( - [ - ("scaler", StandardScaler()), - ("et", ExtraTreesRegressor(n_estimators=5)), - ], - ), - ), - ] +from atom.utils.utils import ClassMap, time_to_str # Test _show_versions ============================================== >> @@ -67,64 +22,6 @@ def test_show_versions(): show_versions() -# Test patches ===================================================== >> - -def test_voting_initialized_fitted(classifiers): - """Assert that the model can be fit 
at initialization.""" - vote = VotingClassifier(estimators=classifiers) - assert check_is_fitted(vote, exception=False) - - -def test_voting_multilabel(classifiers): - """Assert that an error is raised for multilabel targets.""" - vote = VotingClassifier(estimators=classifiers) - with pytest.raises(NotImplementedError, match=".*Multilabel.*"): - vote.fit(X_bin, np.array([[0, 1], [1, 0]])) - - -def test_voting_invalid_type(classifiers): - """Assert that an error is raised when voting type is invalid.""" - vote = VotingClassifier(estimators=classifiers, voting="invalid") - with pytest.raises(ValueError, match=".*must be 'soft'.*"): - vote.fit(X_bin, y_bin) - - -def test_voting_invalid_weights(classifiers): - """Assert that an error is raised when weights have invalid length.""" - vote = VotingClassifier(estimators=classifiers, weights=[0, 1]) - with pytest.raises(ValueError, match=".*estimators and weights.*"): - vote.fit(X_bin, y_bin) - - -def test_voting_mixed_fit_and_not(classifiers): - """Assert that fitted and non-fitted models can be used both.""" - estimators = classifiers.copy() - estimators.append(("not_fitted_lda", LinearDiscriminantAnalysis())) - - vote = VotingClassifier(estimators=estimators) - assert not check_is_fitted(vote, exception=False) - vote.fit(X_bin, y_bin) - assert check_is_fitted(vote, exception=False) - assert len(vote.estimators_) == 3 - assert vote.estimators_[0] is estimators[0][1] # Fitted is same - assert vote.estimators_[2] is not estimators[2][1] # Unfitted changes - - -@pytest.mark.parametrize("voting", ["soft", "hard"]) -def test_voting_predict(classifiers, voting): - """Assert that the predict method doesn't use the encoder.""" - vote = VotingClassifier(estimators=classifiers, voting=voting) - assert isinstance(vote.predict(X_bin), np.ndarray) - - -def test_voting_regressor(regressors): - """Assert that the regressor works.""" - vote = VotingRegressor(estimators=regressors) - assert not check_is_fitted(vote, exception=False) - 
vote.fit(X_reg, y_reg) - assert check_is_fitted(vote, exception=False) - - # Test utils ======================================================= >> def test_classmap_failed_initialization():