From eabeea86341b3917822d818bbd08623dc3ac8705 Mon Sep 17 00:00:00 2001 From: Mavs Date: Fri, 15 Dec 2023 21:11:35 +0100 Subject: [PATCH] add sp --- .github/CONTRIBUTING.md | 2 +- atom/api.py | 37 +- atom/atom.py | 24 +- atom/basemodel.py | 82 ++- atom/baserunner.py | 221 ++++++-- atom/basetrainer.py | 7 +- atom/basetransformer.py | 11 +- atom/data_cleaning.py | 85 +-- atom/feature_engineering.py | 18 +- atom/models/__init__.py | 76 +-- atom/models/classreg.py | 159 +++--- atom/models/custom.py | 3 +- atom/models/ensembles.py | 6 +- atom/models/ts.py | 496 ++++++++++++++++-- atom/nlp.py | 21 +- atom/plots/dataplot.py | 2 +- atom/utils/types.py | 10 +- atom/utils/utils.py | 26 +- docs_sources/api/models/adab.md | 10 +- docs_sources/api/models/ard.md | 10 +- docs_sources/api/models/arima.md | 10 +- docs_sources/api/models/autoarima.md | 10 +- docs_sources/api/models/bag.md | 10 +- docs_sources/api/models/bats.md | 80 +++ docs_sources/api/models/bnb.md | 10 +- docs_sources/api/models/br.md | 10 +- docs_sources/api/models/catb.md | 10 +- docs_sources/api/models/catnb.md | 10 +- docs_sources/api/models/cnb.md | 10 +- docs_sources/api/models/croston.md | 80 +++ docs_sources/api/models/dummy.md | 10 +- docs_sources/api/models/en.md | 10 +- docs_sources/api/models/es.md | 10 +- docs_sources/api/models/et.md | 10 +- docs_sources/api/models/etree.md | 10 +- docs_sources/api/models/ets.md | 10 +- docs_sources/api/models/gbm.md | 10 +- docs_sources/api/models/gnb.md | 10 +- docs_sources/api/models/gp.md | 10 +- docs_sources/api/models/hgbm.md | 10 +- docs_sources/api/models/huber.md | 10 +- docs_sources/api/models/knn.md | 10 +- docs_sources/api/models/lars.md | 10 +- docs_sources/api/models/lasso.md | 10 +- docs_sources/api/models/lda.md | 10 +- docs_sources/api/models/lgb.md | 10 +- docs_sources/api/models/lr.md | 10 +- docs_sources/api/models/lsvm.md | 10 +- docs_sources/api/models/mlp.md | 10 +- docs_sources/api/models/mnb.md | 10 +- docs_sources/api/models/nf.md | 10 +- docs_sources/api/models/ols.md | 10 +- docs_sources/api/models/omp.md | 10 +- docs_sources/api/models/pa.md | 10 +- docs_sources/api/models/perc.md | 10 +- docs_sources/api/models/pt.md | 10 +- docs_sources/api/models/qda.md | 10 +- docs_sources/api/models/rf.md | 10 +- docs_sources/api/models/ridge.md | 10 +- docs_sources/api/models/rnn.md | 10 +- docs_sources/api/models/sgd.md | 10 +- docs_sources/api/models/stl.md | 80 +++ docs_sources/api/models/svm.md | 10 +- docs_sources/api/models/tbats.md | 80 +++ docs_sources/api/models/theta.md | 80 +++ docs_sources/api/models/tree.md | 10 +- docs_sources/api/models/xgb.md | 10 +- docs_sources/contributing.md | 2 +- docs_sources/dependencies.md | 3 +- docs_sources/examples/deep_learning.ipynb | 2 +- .../examples/in_training_validation.ipynb | 8 +- .../examples/multioutput_regression.ipynb | 2 +- docs_sources/scripts/autodocs.py | 54 +- docs_sources/user_guide/models.md | 48 +- docs_sources/user_guide/time_series.md | 10 + docs_sources/user_guide/training.md | 2 +- examples/deep_learning.ipynb | 2 +- examples/in_training_validation.ipynb | 8 +- examples/multioutput_regression.ipynb | 2 +- mkdocs.yml | 27 +- pyproject.toml | 1 + tests/test_api.py | 2 +- tests/test_basemodel.py | 197 ++++--- tests/test_baserunner.py | 7 + tests/test_models.py | 10 +- 85 files changed, 1801 insertions(+), 712 deletions(-) create mode 100644 docs_sources/api/models/bats.md create mode 100644 docs_sources/api/models/croston.md create mode 100644 docs_sources/api/models/stl.md create mode 100644 
docs_sources/api/models/tbats.md create mode 100644 docs_sources/api/models/theta.md diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 6894372fe..108de5968 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -91,7 +91,7 @@ and accept your changes. * Update the documentation so all of your changes are reflected there. * Adhere to [PEP 8](https://peps.python.org/pep-0008/) standards. * Use a maximum of 99 characters per line. Try to keep docstrings below - 74 characters. + 80 characters. * Update the project unit tests to test your code changes as thoroughly as possible. * Make sure that your code is properly commented with docstrings and diff --git a/atom/api.py b/atom/api.py index 9fc0918f2..55537c62e 100644 --- a/atom/api.py +++ b/atom/api.py @@ -17,8 +17,9 @@ from atom.atom import ATOM from atom.utils.types import ( - Backend, Bool, ColumnSelector, Engine, IndexSelector, IntLargerEqualZero, - NJobs, Predictor, Scalar, Verbose, Warnings, YSelector, + Backend, Bool, ColumnSelector, Engine, IndexSelector, Int, + IntLargerEqualZero, NJobs, Predictor, Scalar, Sequence, Verbose, Warnings, + YSelector, ) from atom.utils.utils import Goal @@ -35,13 +36,13 @@ def ATOMModel( needs_scaling: Bool = False, native_multilabel: Bool = False, native_multioutput: Bool = False, - has_validation: str | None = None, + validation: str | None = None, ) -> T_Predictor: """Convert an estimator to a model that can be ingested by atom. - This function adds the relevant attributes to the estimator so - that they can be used by atom. Note that only estimators that - follow [sklearn's API][api] are compatible. + This function adds the relevant tags to the estimator so that they + can be used by `atom`. Note that only estimators that follow + [sklearn's API][api] are compatible. Read more about custom models in the [user guide][custom-models]. @@ -75,7 +76,7 @@ def ATOMModel( If False and the task is multioutput, a multioutput meta-estimator is wrapped around the estimator. - has_validation: str or None, default=None + validation: str or None, default=None Whether the model allows [in-training validation][]. - If None: No support for in-training validation. @@ -121,7 +122,7 @@ def ATOMModel( estimator_c.needs_scaling = needs_scaling estimator_c.native_multioutput = native_multioutput estimator_c.native_multilabel = native_multilabel - estimator_c.has_validation = has_validation + estimator_c.validation = validation return estimator_c @@ -453,6 +454,24 @@ class ATOMForecaster(ATOM): and model training. The features are still used in the remaining methods. + sp: int, str, sequence or None, default=None + [Seasonal period][seasonality] of the time series. + + - If None: No seasonal period. + - If int: Seasonal period, e.g., 7 for weekly data, and 12 for + monthly data. + - If str: + + - Seasonal period provided as [PeriodAlias][], e.g., "M" for + 12 or "H" for 24. + - "index": The frequency of the data index is mapped to a + seasonal period. + - "infer": Automatically infer the seasonal period from the + data (calls [get_seasonal_period][self-get_seasonal_period] + under the hood, using default parameters). + + - If sequence: Multiple seasonal periods provided as int or str. + test_size: int or float, default=0.2 - If <=1: Fraction of the dataset to include in the test set. - If >1: Number of rows to include in the test set. 
@@ -592,6 +611,7 @@ def __init__( *arrays, y: YSelector = -1, ignore: ColumnSelector | None = None, + sp: Int | str | Sequence[Int | str] | None = None, n_rows: Scalar = 1, test_size: Scalar = 0.2, holdout_size: Scalar | None = None, @@ -611,6 +631,7 @@ def __init__( y=y, index=True, ignore=ignore, + sp=sp, test_size=test_size, holdout_size=holdout_size, shuffle=False, diff --git a/atom/atom.py b/atom/atom.py index 95ed7274c..aecf632b3 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -56,7 +56,7 @@ NJobs, NormalizerStrats, NumericalStrats, Operators, Pandas, PrunerStrats, RowSelector, Scalar, ScalerStrats, Sequence, Series, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings, XSelector, YSelector, - sequence_t, tsindex_t, + sequence_t, ) from atom.utils.utils import ( ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk, @@ -95,6 +95,7 @@ def __init__( y: YSelector = -1, index: IndexSelector = False, ignore: ColumnSelector | None = None, + sp: Int | str | Sequence[Int | str] | None = None, shuffle: Bool = True, stratify: IndexSelector = True, n_rows: Scalar = 1, @@ -133,18 +134,19 @@ def __init__( holdout_size=holdout_size, ) - self._log("<< ================== ATOM ================== >>", 1) - # Initialize the branch system and fill with data self._branches = BranchManager(memory=self.memory) self._branches.fill(*self._get_data(arrays, y=y)) self.ignore = ignore # type: ignore[assignment] + self.sp = sp # type: ignore[assignment] + self.missing = DEFAULT_MISSING self._models = ClassMap() self._metric = ClassMap() + self._log("<< ================== ATOM ================== >>", 1) self._log("\nConfiguration ==================== >>", 1) self._log(f"Algorithm task: {self.task}.", 1) if self.n_jobs > 1: @@ -747,8 +749,8 @@ def load(cls, filename: str | Path, data: tuple[Any, ...] 
| None = None) -> ATOM if atom._config.index is False: branch._container = DataContainer( data=(dataset := branch._container.data.reset_index(drop=True)), - train_idx=dataset.index[: len(branch._container.train_idx)], - test_idx=dataset.index[-len(branch._container.test_idx) :], + train_idx=dataset.index[:len(branch._container.train_idx)], + test_idx=dataset.index[-len(branch._container.test_idx):], n_cols=branch._container.n_cols, ) @@ -956,11 +958,13 @@ def stats(self, _vb: Int = -2, /): """ self._log("Dataset stats " + "=" * 20 + " >>", _vb) self._log(f"Shape: {self.shape}", _vb) + if self.task.is_forecast and self.sp: + self._log(f"Seasonal period: {self.sp}", _vb) - for set_ in ("train", "test", "holdout"): - if (data := getattr(self, set_)) is not None: - self._log(f"{set_.capitalize()} set size: {len(data)}", _vb) - if isinstance(self.branch.train.index, tsindex_t): + for ds in ("train", "test", "holdout"): + if (data := getattr(self, ds)) is not None: + self._log(f"{ds.capitalize()} set size: {len(data)}", _vb) + if self.task.is_forecast: self._log(f" --> From: {min(data.index)} To: {max(data.index)}", _vb) self._log("-" * 37, _vb) @@ -1231,7 +1235,7 @@ def _add_transformer( self.branch._container = DataContainer( data=(data := self.dataset.reset_index(drop=True)), train_idx=data.index[: len(self.branch._data.train_idx)], - test_idx=data.index[-len(self.branch._data.test_idx) :], + test_idx=data.index[-len(self.branch._data.test_idx):], n_cols=self.branch._data.n_cols, ) if self.branch._holdout is not None: diff --git a/atom/basemodel.py b/atom/basemodel.py index 8e539e91c..71e89db65 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -253,13 +253,23 @@ def __init__( self._branch = branches.current self._train_idx = len(self.branch._data.train_idx) # Can change for sh and ts - if self.needs_scaling and not check_scaling(self.X, pipeline=self.pipeline): - self.scaler = Scaler().fit(self.X_train) + if hasattr(self, "needs_scaling"): + if self.needs_scaling and not check_scaling(self.X, pipeline=self.pipeline): + self.scaler = Scaler().fit(self.X_train) def __repr__(self) -> str: """Display class name.""" return f"{self.__class__.__name__}()" + def __dir__(self) -> list[str]: + """Add additional attrs from __getattr__ to the dir.""" + attrs = list(super().__dir__()) + if "_branch" in self.__dict__: + attrs += [x for x in dir(self.branch) if not x.startswith("_")] + attrs += list(DF_ATTRS) + attrs += list(self.columns) + return attrs + def __getattr__(self, item: str) -> Any: """Get attributes from branch or data.""" if "_branch" in self.__dict__: @@ -449,9 +459,10 @@ def _get_est(self, params: dict[str, Any]) -> Predictor: estimator = MultiOutputClassifier(estimator) elif self.task.is_regression: estimator = MultiOutputRegressor(estimator) - elif hasattr(self, "_estimators") and self._goal.name not in self._estimators: - # Forecasting task with a regressor - estimator = make_reduction(estimator) + elif self.task.is_forecast: + if hasattr(self, "_estimators") and self._goal.name not in self._estimators: + # Forecasting task with a regressor + estimator = make_reduction(estimator) return self._inherit(estimator) @@ -494,13 +505,13 @@ def _fit_estimator( Fitted instance. 
""" - if self.has_validation and hasattr(estimator, "partial_fit") and validation: + if getattr(self, "validation", False) and hasattr(estimator, "partial_fit") and validation: # Loop over first parameter in estimator try: - steps = estimator.get_params()[self.has_validation] + steps = estimator.get_params()[self.validation] except KeyError: # For meta-estimators like multioutput - steps = estimator.get_params()[f"estimator__{self.has_validation}"] + steps = estimator.get_params()[f"estimator__{self.validation}"] for step in range(steps): kwargs = {} @@ -533,8 +544,8 @@ def _fit_estimator( if trial.should_prune(): # Hacky solution to add the pruned step to the output - if self.has_validation in trial.params: - trial.params[self.has_validation] = f"{step}/{steps}" + if self.validation in trial.params: + trial.params[self.validation] = f"{step}/{steps}" trial.set_user_attr("estimator", estimator) raise TrialPruned @@ -1308,7 +1319,7 @@ def name(self, value: str): """Change the model's name.""" # Drop the acronym if provided by the user if re.match(f"{self.acronym}_", value, re.I): - value = value[len(self.acronym) + 1 :] + value = value[len(self.acronym) + 1:] # Add the acronym in front (with right capitalization) self._name = f"{self.acronym}{f'_{value}' if value else ''}" @@ -2437,6 +2448,32 @@ def transform( class ClassRegModel(BaseModel): """Classification and regression models.""" + def get_tags(self) -> dict[str, Any]: + """Get the model's tags. + + Return class parameters that provide general information about + the estimator's characteristics. + + Returns + ------- + dict + Model's tags. + + """ + return { + "acronym": self.acronym, + "fullname": self.fullname, + "estimator": self._est_class, + "module": self._est_class.__module__.split(".")[0] + self._module, + "handles_missing": self.handles_missing, + "needs_scaling": self.needs_scaling, + "accepts_sparse": self.accepts_sparse, + "native_multilabel": self.native_multilabel, + "native_multioutput": self.native_multioutput, + "validation": self.validation, + "supports_engines": ", ".join(self.supports_engines), + } + @overload def _prediction( self, @@ -2845,6 +2882,29 @@ def score( class ForecastModel(BaseModel): """Forecasting models.""" + def get_tags(self) -> dict[str, Any]: + """Get the model's tags. + + Return class parameters that provide general information about + the estimator's characteristics. + + Returns + ------- + dict + Model's tags. 
+ + """ + return { + "acronym": self.acronym, + "fullname": self.fullname, + "estimator": self._est_class.__name__, + "module": self._est_class.__module__.split(".")[0] + self._module, + "handles_missing": self.handles_missing, + "uses_exogenous": self.uses_exogenous, + "in_sample_prediction": self.in_sample_prediction, + "native_multivariate": self.native_multivariate, + "supports_engines": ", ".join(self.supports_engines), + } + @overload def _prediction( self, diff --git a/atom/baserunner.py b/atom/baserunner.py index b776749ac..7019bac81 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -7,6 +7,7 @@ from __future__ import annotations +import math import random import re from abc import ABCMeta @@ -17,12 +18,17 @@ from typing import Any import dill as pickle +import numpy as np import pandas as pd from beartype import beartype +from pandas.tseries.frequencies import to_offset +from pmdarima.arima.utils import ndiffs from sklearn.model_selection import train_test_split from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.metaestimators import available_if from sktime.datatypes import check_is_mtype +from sktime.param_est.seasonality import SeasonalityACF +from sktime.transformations.series.difference import Differencer from atom.basetracker import BaseTracker from atom.basetransformer import BaseTransformer @@ -31,14 +37,15 @@ from atom.pipeline import Pipeline from atom.utils.constants import DF_ATTRS from atom.utils.types import ( - Bool, DataFrame, FloatZeroToOneExc, Int, MetricConstructor, Model, - ModelSelector, ModelsSelector, Pandas, RowSelector, Scalar, Segment, - Sequence, Series, YSelector, dataframe_t, int_t, segment_t, sequence_t, + Bool, DataFrame, FloatZeroToOneExc, HarmonicsSelector, Int, + MetricConstructor, Model, ModelSelector, ModelsSelector, Pandas, + RowSelector, Scalar, Segment, Sequence, Series, YSelector, dataframe_t, + int_t, segment_t, sequence_t, ) from atom.utils.utils import ( - ClassMap, DataContainer, Task, bk, check_is_fitted, composed, crash, - divide, flt, get_cols, get_segment, get_versions, has_task, lst, merge, - method_to_log, n_cols, + ClassMap, DataContainer, SeasonalPeriod, Task, bk, check_is_fitted, + composed, crash, divide, flt, get_cols, get_segment, get_versions, + has_task, lst, merge, method_to_log, n_cols, ) @@ -70,6 +77,17 @@ def __setstate__(self, state: dict[str, Any]): severity="warning", ) + def __dir__(self) -> list[str]: + """Add additional attrs from __getattr__ to the dir.""" + attrs = list(super().__dir__()) + attrs += [x for x in dir(self.branch) if not x.startswith("_")] + attrs += list(DF_ATTRS) + attrs += [b.name.lower() for b in self._branches] + attrs += list(self.columns) + if isinstance(self._models, ClassMap): + attrs += [m.name.lower() for m in self._models] + return attrs + def __getattr__(self, item: str) -> Any: """Get branch, attr from branch, model, column or attr from dataset.""" if item in self.__dict__["_branches"]: @@ -138,6 +156,33 @@ def task(self) -> Task: """Dataset's [task][] type.""" return self._goal.infer_task(self.y) + @property + def sp(self) -> int | list[int] | None: + """Seasonal period(s) of the time series. + + Read more about seasonality in the [user guide][seasonality]. + + """ + return self._sp + + @sp.setter + def sp(self, sp: Int | str | Sequence[Int | str] | None): + """Convert seasonal period to integer value.""" + if sp is None: + self._sp = None + elif sp == "index": + if not hasattr(self.dataset.index, "freqstr"): + raise ValueError( + f"Invalid value for the seasonal period, got {sp}. 
" + f"The dataset's index has no attribute freqstr." + ) + else: + self._sp = self._get_sp(self.dataset.index.freqstr) + elif sp == "infer": + self._sp = self.get_seasonal_period() + else: + self._sp = flt([self._get_sp(x) for x in lst(sp)]) + @property def og(self) -> Branch: """Branch containing the original dataset. @@ -269,6 +314,36 @@ def frac(m: Model) -> float: # Utility methods ============================================== >> + @staticmethod + def _get_sp(sp: Int | str) -> int: + """Get the seasonal period from a value or string. + + Parameters + ---------- + sp: int or str + Seasonal period provided as int or [DateOffset][]. + + Returns + ------- + int + Seasonal period. + + """ + if isinstance(sp, str): + if offset := to_offset(sp): # Convert to pandas' DateOffset + name, period = offset.name.split("-")[0], offset.n + + if name not in SeasonalPeriod.__members__: + raise ValueError( + f"Invalid value for the seasonal period, got {name}. " + f"Valid values are: {', '.join(SeasonalPeriod.__members__)}" + ) + + # Formula is same as SeasonalPeriod[name] for period=1 + return math.lcm(SeasonalPeriod[name].value, period) // period + else: + return int(sp) + def _set_index(self, df: DataFrame, y: Pandas | None) -> DataFrame: """Assign an index to the dataframe. @@ -482,8 +557,8 @@ def _no_data_sets( container = DataContainer( data=(data := complete_set.iloc[: len(data)]), - train_idx=data.index[: -len(test)], - test_idx=data.index[-len(test) :], + train_idx=data.index[:-len(test)], + test_idx=data.index[-len(test):], n_cols=len(get_cols(y)), ) @@ -500,7 +575,7 @@ def _no_data_sets( raise ex if holdout is not None: - holdout = complete_set.iloc[len(data) :] + holdout = complete_set.iloc[len(data):] return container, holdout @@ -589,19 +664,19 @@ def _has_data_sets( train.index = self._config.index[: len(train)] test.index = self._config.index[len(train) : len(train) + len(test)] if holdout is not None: - holdout.index = self._config.index[-len(holdout) :] + holdout.index = self._config.index[-len(holdout):] complete_set = self._set_index(bk.concat([train, test, holdout]), y_test) container = DataContainer( - data=(data := complete_set.iloc[: len(train) + len(test)]), + data=(data := complete_set.iloc[:len(train) + len(test)]), train_idx=data.index[: len(train)], - test_idx=data.index[-len(test) :], + test_idx=data.index[-len(test):], n_cols=len(get_cols(y_train)), ) if holdout is not None: - holdout = complete_set.iloc[len(train) + len(test) :] + holdout = complete_set.iloc[len(train) + len(test):] return container, holdout @@ -819,20 +894,27 @@ def available_models(self) -> pd.DataFrame: Returns ------- pd.DataFrame - Information about the available [predefined models][]. Columns - include: + Tags of the available [predefined models][]. The columns + depend on the task, but can include: - **acronym:** Model's acronym (used to call the model). - - **model:** Name of the model's class. - - **estimator:** The model's underlying estimator. + - **fullname:** Name of the model's class. + - **estimator:** Class of the model's underlying estimator. - **module:** The estimator's module. + - **handles_missing:** Whether the model can handle `NaN` values + without preprocessing. - **needs_scaling:** Whether the model requires feature scaling. - **accepts_sparse:** Whether the model accepts sparse matrices. + - **uses_exogenous:** Whether the model uses exogenous variables. + - **in_sample_prediction:** Whether the model can do predictions + on the training set.
- **native_multilabel:** Whether the model has native support for [multilabel][] tasks. - **native_multioutput:** Whether the model has native support for [multioutput tasks][]. - - **has_validation:** Whether the model has [in-training validation][]. + - **native_multivariate:** Whether the model has native support + for [multivariate][] tasks. + - **validation:** Whether the model has [in-training validation][]. - **supports_engines:** Engines supported by the model. """ rows = [] for model in MODELS: m = model(goal=self._goal) if self._goal.name in m._estimators: - rows.append( - { - "acronym": m.acronym, - "model": m.fullname, - "estimator": m._est_class.__name__, - "module": m._est_class.__module__.split(".")[0] + m._module, - "needs_scaling": m.needs_scaling, - "accepts_sparse": m.accepts_sparse, - "native_multilabel": m.native_multilabel, - "native_multioutput": m.native_multioutput, - "has_validation": bool(m.has_validation), - "supports_engines": ", ".join(m.supports_engines), - } - ) + rows.append(m.get_tags()) return pd.DataFrame(rows) @@ -1051,6 +1120,94 @@ def get_sample_weight(self, rows: RowSelector = "train") -> Series: weights = compute_sample_weight("balanced", y=y) return bk.Series(weights, name="sample_weight").round(3) + @available_if(has_task("forecast")) + @composed(crash, beartype) + def get_seasonal_period( + self, + max_sp: Int | None = None, + harmonics: HarmonicsSelector | None = None, + ) -> int | list[int]: + """Get the seasonal periods of the time series. + + Use the data in the training set to calculate the seasonal + period. The data is internally differenced before the + seasonality is detected using ACF. + + !!! tip + Read more about seasonality in the [user guide][seasonality]. + + Parameters + ---------- + max_sp: int or None, default=None + Maximum seasonal period to consider. If None, the maximum + period is given by `(len(y_train) - 1) // 2`. + + harmonics: str or None, default=None + Defines the strategy on how to deal with harmonics from the + detected seasonal periods. Choose from the following options: + + - None: The detected seasonal periods are left unchanged + (no harmonic removal). + - "drop": Remove all harmonics. + - "raw_strength": Keep the highest order harmonics, maintaining + the order of significance. + - "harmonic_strength": Replace seasonal periods with their highest + harmonic. + + E.g., if the detected seasonal periods in strength order are + `[2, 3, 4, 7, 8]` (note that 4 and 8 are harmonics of 2), then: + + - If "drop", result=[2, 3, 7] + - If "raw_strength", result=[3, 7, 8] + - If "harmonic_strength", result=[8, 3, 7] + + Returns + ------- + int or list of int + Seasonal period(s), ordered by significance.
+ + """ + yt = self.y_train.copy() + max_sp = max_sp or (len(yt) - 1) // 2 + + for _ in np.arange(ndiffs(yt)): + yt = Differencer().fit_transform(yt) + + acf = SeasonalityACF(nlags=max_sp).fit(pd.DataFrame(yt)) + seasonal_periods = acf.get_fitted_params().get("sp_significant") + + if harmonics and len(seasonal_periods) > 1: + # Create a dictionary of the seasonal periods and their harmonics + harmonic_dict: dict[int, list[int]] = {} + for sp in seasonal_periods: + for k in harmonic_dict: + if sp % k == 0: + harmonic_dict[k].append(sp) + break + else: + harmonic_dict[sp] = [] + + # For periods without harmonics, simplify operations + # by setting the value of the key to itself + harmonic_dict = {k: (v or [k]) for k, v in harmonic_dict.items()} + + if harmonics == "drop": + seasonal_periods = list(harmonic_dict.keys()) + elif harmonics == "raw_strength": + seasonal_periods = [ + sp for sp in seasonal_periods + if any(max(v) == sp for v in harmonic_dict.values()) + ] + elif harmonics == "harmonic_strength": + seasonal_periods = [max(v) for v in harmonic_dict.values()] + + if not (seasonal_periods := [int(sp) for sp in seasonal_periods if sp <= max_sp]): + raise ValueError( + "No seasonal periods were detected. Try decreasing the max_sp parameter." + ) + + return flt(seasonal_periods) + @composed(crash, method_to_log, beartype) def merge(self, other: BaseRunner, /, suffix: str = "2"): """Merge another instance of the same class into this one. diff --git a/atom/basetrainer.py b/atom/basetrainer.py index 63e730cc0..0a9f63110 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -182,7 +182,7 @@ def _prepare_parameters(self): raise ValueError( f"Invalid value for the models parameter, got {m}. " "Note that tags must be separated by an underscore. " - f"Available model are:\n" + "Available models are:\n" + "\n".join( [ f" --> {m.__name__} ({m.acronym})" @@ -195,8 +195,11 @@ # Check if libraries for non-sklearn models are available dependencies = { "ARIMA": "pmdarima", - "Catb": "castboost", + "AutoARIMA": "pmdarima", + "BATS": "tbats", + "CatB": "catboost", "LGB": "lightgbm", + "TBATS": "tbats", "XGB": "xgboost", } if cls.acronym in dependencies: diff --git a/atom/basetransformer.py b/atom/basetransformer.py index c75501c2f..f23180cc1 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -353,10 +353,11 @@ def _device_id(self) -> int: # Methods ====================================================== >> def _inherit(self, obj: T_Estimator) -> T_Estimator: - """Inherit n_jobs and/or random_state from parent. + """Inherit parameters from parent. - Utility method to set the n_jobs and random_state parameters - of an estimator (if available) equal to that of this instance. + Utility method to set the sp (seasonal period), n_jobs and + random_state parameters of an estimator (if available) equal + to that of this instance.
Parameters ---------- obj: Estimator Instance from which to get the parameter. Returns ------- Estimator Same object with changed parameters. """ signature = sign(obj.__init__) # type: ignore[misc] - for p in ("n_jobs", "random_state"): + for p in ("sp", "n_jobs", "random_state"): if p in signature and getattr(obj, p, "") == signature[p]._default: - setattr(obj, p, getattr(self, p)) + setattr(obj, p, getattr(self, p, signature[p]._default)) return obj diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 7fd488e2e..cf276d987 100644 --- a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -274,13 +274,13 @@ class Balancer(TransformerMixin): Target values mapped to their respective encoded integers. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. target_names_in_: np.ndarray - Names of the target column seen during fit. + Names of the target column seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -527,9 +527,9 @@ def log_changes(y): ] # Select the new samples and assign the new indices - X_new = X_new.iloc[-len(X_new) + len(o_samples) :] + X_new = X_new.iloc[-len(X_new) + len(o_samples):] X_new.index = n_idx - y_new = y_new.iloc[-len(y_new) + len(o_samples) :] + y_new = y_new.iloc[-len(y_new) + len(o_samples):] y_new.index = n_idx # First, output the samples created @@ -603,21 +603,22 @@ class Cleaner(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: dict, default={"data": "numpy", "estimator": "sklearn"} + engine: dict or None, default=None Execution engine to use for [data][data-acceleration] and [estimators][estimator-acceleration]. The value should be a dictionary with keys `data` and/or `estimator`, with their - corresponding choice as values. Choose from: + corresponding choice as values. If None, the default values + are used. Choose from: - "data": - - "numpy" + - "numpy" (default) - "pyarrow" - "modin" - "estimator": - - "sklearn" + - "sklearn" (default) - "cuml" verbose: int, default=0 @@ -646,13 +647,13 @@ class Cleaner(TransformerMixin): available if encode_target=True. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. target_names_in_: np.ndarray - Names of the target column(s) seen during fit. + Names of the target column(s) seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -1031,21 +1032,22 @@ class Discretizer(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: dict, default={"data": "numpy", "estimator": "sklearn"} + engine: dict or None, default=None Execution engine to use for [data][data-acceleration] and [estimators][estimator-acceleration]. The value should be a dictionary with keys `data` and/or `estimator`, with their - corresponding choice as values. Choose from: + corresponding choice as values. If None, the default values + are used. Choose from: - "data": - - "numpy" + - "numpy" (default) - "pyarrow" - "modin" - "estimator": - - "sklearn" + - "sklearn" (default) - "cuml" verbose: int, default=0 @@ -1068,10 +1070,10 @@ class Discretizer(TransformerMixin): Attributes ---------- feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. 
See Also -------- @@ -1398,10 +1400,10 @@ class Encoder(TransformerMixin): the key to its mapping dictionary. Only for ordinal encoding. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -1731,21 +1733,22 @@ class Imputer(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: dict, default={"data": "numpy", "estimator": "sklearn"} + engine: dict or None, default=None Execution engine to use for [data][data-acceleration] and [estimators][estimator-acceleration]. The value should be a dictionary with keys `data` and/or `estimator`, with their - corresponding choice as values. Choose from: + corresponding choice as values. If None, the default values + are used. Choose from: - "data": - - "numpy" + - "numpy" (default) - "pyarrow" - "modin" - "estimator": - - "sklearn" + - "sklearn" (default) - "cuml" verbose: int, default=0 @@ -1775,10 +1778,10 @@ class Imputer(TransformerMixin): with sklearn estimators. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -2132,21 +2135,22 @@ class Normalizer(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: dict, default={"data": "numpy", "estimator": "sklearn"} + engine: dict or None, default=None Execution engine to use for [data][data-acceleration] and [estimators][estimator-acceleration]. The value should be a dictionary with keys `data` and/or `estimator`, with their - corresponding choice as values. Choose from: + corresponding choice as values. If None, the default values + are used. Choose from: - "data": - - "numpy" + - "numpy" (default) - "pyarrow" - "modin" - "estimator": - - "sklearn" + - "sklearn" (default) - "cuml" verbose: int, default=0 @@ -2174,10 +2178,10 @@ class Normalizer(TransformerMixin): `normalizer.yeojohnson` for the default strategy. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -2453,10 +2457,10 @@ class Pruner(TransformerMixin): isolation forest strategy. Not available for strategy="zscore". feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -2708,21 +2712,22 @@ class Scaler(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: dict, default={"data": "numpy", "estimator": "sklearn"} + engine: dict or None, default=None Execution engine to use for [data][data-acceleration] and [estimators][estimator-acceleration]. The value should be a dictionary with keys `data` and/or `estimator`, with their - corresponding choice as values. Choose from: + corresponding choice as values. 
If None, the default values + are used. Choose from: - "data": - - "numpy" + - "numpy" (default) - "pyarrow" - "modin" - "estimator": - - "sklearn" + - "sklearn" (default) - "cuml" verbose: int, default=0 @@ -2746,10 +2751,10 @@ class Scaler(TransformerMixin): `scaler.standard` for the default strategy. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 6d2aba266..65a284c89 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -109,10 +109,10 @@ class FeatureExtractor(TransformerMixin): Attributes ---------- feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -360,10 +360,10 @@ class FeatureGenerator(TransformerMixin): - **fitness:** Fitness score. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -487,7 +487,7 @@ def fit(self, X: DataFrame, y: Pandas | None = None) -> Self: ) # Select the new features (dfs also returns originals) - self._dfs = self._dfs[X.shape[1] - 1 :] + self._dfs = self._dfs[X.shape[1] - 1:] # Get a random selection of features if self.n_features and self.n_features < len(self._dfs): @@ -638,10 +638,10 @@ class FeatureGrouper(TransformerMixin): Attributes ---------- feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -976,10 +976,10 @@ class FeatureSelector(TransformerMixin): strategy. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- diff --git a/atom/models/__init__.py b/atom/models/__init__.py index 7bdbc8ab7..3e5e4a5b5 100644 --- a/atom/models/__init__.py +++ b/atom/models/__init__.py @@ -3,73 +3,6 @@ Author: Mavs Description: Module for models. -To add new models, note the following: - -1. Add the class in the right file depending on the task. -2. Models are ordered alphabetically. -3. Models have the following structure: - -Class attributes ----------------- -acronym: str - Acronym of the model's name. - -needs_scaling: bool - Whether the model needs scaled features. - -accepts_sparse: bool - Whether the model has native support for sparse matrices. - -native_multilabel: bool - Whether the model has native support for multilabel tasks. - -native_multioutput: bool - Whether the model has native support for multioutput tasks. - -has_validation: str or None - Whether the model allows in-training validation. If str, - name of the estimator's parameter that states the number - of iterations. If None, no support for in-training - validation. - -supports_engines: list - Engines that can be used to run this model. - -_module: str - Module from which to load the class. If one of engines, - ignore the engine name, i.e., use "ensemble" instead of - "sklearn.ensemble". - -_estimators: dict - Name of the estimators per goal. 
- -Instance attributes -------------------- -name: str - Name of the model. Defaults to the same as the acronym - but can be different if the same model is called multiple - times. The name is assigned in the basemodel.py module. - -Methods -------- -_get_parameters(self, x) -> dict: - Return the trial's suggestions with (optionally) custom changes - to the params. Don't implement if the parent's implementation - is sufficient. - -_trial_to_est(self, params) -> dict: - Convert trial's hyperparameters to parameters for the - estimator. Only implement for models whose study params are - different from those for the estimator. - -_fit_estimator(self, estimator, data, est_params_fit, validation, trial): - This method is called to fit the estimator. Implement only - to customize the fit. - -_get_distributions(self) -> dict: - Return a list of the hyperparameter distributions for - optimization. - """ from atom.models.classreg import ( @@ -87,8 +20,8 @@ from atom.models.custom import CustomModel from atom.models.ensembles import Stacking, Voting from atom.models.ts import ( - ARIMA, ETS, AutoARIMA, ExponentialSmoothing, NaiveForecaster, - PolynomialTrend, + ARIMA, BATS, ETS, STL, TBATS, AutoARIMA, Croston, ExponentialSmoothing, + NaiveForecaster, PolynomialTrend, Theta, ) from atom.utils.types import Predictor from atom.utils.utils import ClassMap @@ -101,11 +34,13 @@ AutoARIMA, AutomaticRelevanceDetermination, Bagging, + BATS, BayesianRidge, BernoulliNB, CatBoost, CategoricalNB, ComplementNB, + Croston, DecisionTree, Dummy, ElasticNet, @@ -137,8 +72,11 @@ RadiusNearestNeighbors, RandomForest, Ridge, + STL, StochasticGradientDescent, SupportVectorMachine, + TBATS, + Theta, XGBoost, key="acronym", ) diff --git a/atom/models/classreg.py b/atom/models/classreg.py index 12e091f8b..994ce35f6 100644 --- a/atom/models/classreg.py +++ b/atom/models/classreg.py @@ -63,11 +63,12 @@ class AdaBoost(ClassRegModel): """ acronym = "AdaB" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "ensemble" @@ -133,11 +134,12 @@ class AutomaticRelevanceDetermination(ClassRegModel): """ acronym = "ARD" + handles_missing = False needs_scaling = True accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "linear_model" @@ -201,11 +203,12 @@ class Bagging(ClassRegModel): """ acronym = "Bag" + handles_missing = True needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "ensemble" @@ -267,11 +270,12 @@ class BayesianRidge(ClassRegModel): """ acronym = "BR" + handles_missing = False needs_scaling = True accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "linear_model" @@ -331,11 +335,12 @@ class BernoulliNB(ClassRegModel): """ acronym = "BNB" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "cuml") _module = "naive_bayes" @@ -410,11 +415,12 @@ class CatBoost(ClassRegModel): """ acronym = "CatB" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = 
False native_multioutput = False - has_validation = "n_estimators" + validation = "n_estimators" supports_engines = ("catboost",) _module = "catboost" @@ -532,8 +538,8 @@ def _fit_estimator( if trial and len(self._metric) == 1 and cb._pruned: # Add the pruned step to the output step = len(self.evals[f"{m}_train"]) - steps = estimator.get_params()[self.has_validation] - trial.params[self.has_validation] = f"{step}/{steps}" + steps = estimator.get_params()[self.validation] + trial.params[self.validation] = f"{step}/{steps}" trial.set_user_attr("estimator", estimator) raise TrialPruned(cb._message) @@ -597,11 +603,12 @@ class CategoricalNB(ClassRegModel): """ acronym = "CatNB" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "cuml") _module = "naive_bayes" @@ -657,11 +664,12 @@ class ComplementNB(ClassRegModel): """ acronym = "CNB" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "cuml") _module = "naive_bayes" @@ -717,11 +725,12 @@ class DecisionTree(ClassRegModel): """ acronym = "Tree" + handles_missing = True needs_scaling = False accepts_sparse = True native_multilabel = True native_multioutput = True - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "tree" @@ -792,11 +801,12 @@ class Dummy(ClassRegModel): """ acronym = "Dummy" + handles_missing = False needs_scaling = False accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "dummy" @@ -859,11 +869,12 @@ class ElasticNet(ClassRegModel): """ acronym = "EN" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "linear_model" @@ -924,11 +935,12 @@ class ExtraTree(ClassRegModel): """ acronym = "ETree" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = True native_multioutput = True - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "tree" @@ -998,11 +1010,12 @@ class ExtraTrees(ClassRegModel): """ acronym = "ET" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = True native_multioutput = True - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "ensemble" @@ -1027,9 +1040,8 @@ def _get_parameters(self, trial: Trial) -> dict: """ params = super()._get_parameters(trial) - if not self._get_param("bootstrap", params): - if "max_samples" in params: - params["max_samples"] = None + if not self._get_param("bootstrap", params) and "max_samples" in params: + params["max_samples"] = None return params @@ -1094,11 +1106,12 @@ class GaussianNB(ClassRegModel): """ acronym = "GNB" + handles_missing = False needs_scaling = False accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "cuml") _module = "naive_bayes" @@ -1153,11 +1166,12 @@ class GaussianProcess(ClassRegModel): """ acronym = "GP" + handles_missing = False needs_scaling = False accepts_sparse = False native_multilabel = False native_multioutput = 
False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "gaussian_process" @@ -1209,11 +1223,12 @@ class GradientBoostingMachine(ClassRegModel): """ acronym = "GBM" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "ensemble" @@ -1288,11 +1303,12 @@ class HuberRegression(ClassRegModel): """ acronym = "Huber" + handles_missing = False needs_scaling = True accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "linear_model" @@ -1354,11 +1370,12 @@ class HistGradientBoosting(ClassRegModel): """ acronym = "hGBM" + handles_missing = True needs_scaling = False accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "ensemble" @@ -1430,11 +1447,12 @@ class KNearestNeighbors(ClassRegModel): """ acronym = "KNN" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = True native_multioutput = True - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "neighbors" @@ -1502,11 +1520,12 @@ class Lasso(ClassRegModel): """ acronym = "Lasso" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "linear_model" @@ -1565,11 +1584,12 @@ class LeastAngleRegression(ClassRegModel): """ acronym = "Lars" + handles_missing = False needs_scaling = True accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "linear_model" @@ -1620,11 +1640,12 @@ class LightGBM(ClassRegModel): """ acronym = "LGB" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = "n_estimators" + validation = "n_estimators" supports_engines = ("lightgbm",) _module = "lightgbm.sklearn" @@ -1720,8 +1741,8 @@ def _fit_estimator( # Add the pruned step to the output step = str(ex).split(" ")[-1][:-1] - steps = estimator.get_params()[self.has_validation] - trial.params[self.has_validation] = f"{step}/{steps}" + steps = estimator.get_params()[self.validation] + trial.params[self.validation] = f"{step}/{steps}" trial.set_user_attr("estimator", estimator) raise ex @@ -1793,11 +1814,12 @@ class LinearDiscriminantAnalysis(ClassRegModel): """ acronym = "LDA" + handles_missing = False needs_scaling = False accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "discriminant_analysis" @@ -1819,9 +1841,8 @@ def _get_parameters(self, trial: Trial) -> dict: """ params = super()._get_parameters(trial) - if self._get_param("solver", params) == "svd": - if "shrinkage" in params: - params["shrinkage"] = None + if self._get_param("solver", params) == "svd" and "shrinkage" in params: + params["shrinkage"] = None return params @@ -1877,11 +1898,12 @@ class LinearSVM(ClassRegModel): """ acronym = "lSVM" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False 
- has_validation = None + validation = None supports_engines = ("sklearn", "cuml") _module = "svm" @@ -1916,11 +1938,9 @@ def _get_parameters(self, trial: Trial) -> dict: params["dual"] = True elif self._get_param("loss", params) == "squared_hinge": # l1 regularization can't be combined with squared_hinge when dual=True - if self._get_param("penalty", params) == "l1": - if "dual" in params: - params["dual"] = False - elif self._get_param("loss", params) == "epsilon_insensitive": - if "dual" in params: + if self._get_param("penalty", params) == "l1" and "dual" in params: + params["dual"] = False + elif self._get_param("loss", params) == "epsilon_insensitive" and "dual" in params: params["dual"] = True return params @@ -2006,11 +2026,12 @@ class LogisticRegression(ClassRegModel): """ acronym = "LR" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "linear_model" @@ -2039,9 +2060,8 @@ def _get_parameters(self, trial: Trial) -> dict: cond_2 = penalty == "l1" and solver not in ("liblinear", "saga") cond_3 = penalty == "elasticnet" and solver != "saga" - if cond_1 or cond_2 or cond_3: - if "penalty" in params: - params["penalty"] = "l2" # Change to default value + if (cond_1 or cond_2 or cond_3) and "penalty" in params: + params["penalty"] = "l2" # Change to default value return params @@ -2113,11 +2133,12 @@ class MultiLayerPerceptron(ClassRegModel): """ acronym = "MLP" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = True native_multioutput = False - has_validation = "max_iter" + validation = "max_iter" supports_engines = ("sklearn",) _module = "neural_network" @@ -2219,11 +2240,12 @@ class MultinomialNB(ClassRegModel): """ acronym = "MNB" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "cuml") _module = "naive_bayes" @@ -2281,11 +2303,12 @@ class OrdinaryLeastSquares(ClassRegModel): """ acronym = "OLS" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "linear_model" @@ -2326,11 +2349,12 @@ class OrthogonalMatchingPursuit(ClassRegModel): """ acronym = "OMP" + handles_missing = False needs_scaling = True accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "linear_model" @@ -2373,11 +2397,12 @@ class PassiveAggressive(ClassRegModel): """ acronym = "PA" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = "max_iter" + validation = "max_iter" supports_engines = ("sklearn",) _module = "linear_model" @@ -2449,11 +2474,12 @@ class Perceptron(ClassRegModel): """ acronym = "Perc" + handles_missing = False needs_scaling = True accepts_sparse = False native_multilabel = False native_multioutput = False - has_validation = "max_iter" + validation = "max_iter" supports_engines = ("sklearn",) _module = "linear_model" @@ -2514,11 +2540,12 @@ class QuadraticDiscriminantAnalysis(ClassRegModel): """ acronym = "QDA" + handles_missing = False needs_scaling = False accepts_sparse = False 
native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "discriminant_analysis" @@ -2584,11 +2611,12 @@ class RadiusNearestNeighbors(ClassRegModel): """ acronym = "RNN" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = True native_multioutput = True - has_validation = None + validation = None supports_engines = ("sklearn",) _module = "neighbors" @@ -2660,11 +2688,12 @@ class RandomForest(ClassRegModel): """ acronym = "RF" + handles_missing = False needs_scaling = False accepts_sparse = True native_multilabel = True native_multioutput = True - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "ensemble" @@ -2689,9 +2718,8 @@ def _get_parameters(self, trial: Trial) -> dict: """ params = super()._get_parameters(trial) - if not self._get_param("bootstrap", params): - if "max_samples" in params: - params["max_samples"] = None + if not self._get_param("bootstrap", params) and "max_samples" in params: + params["max_samples"] = None return params @@ -2775,11 +2803,12 @@ class Ridge(ClassRegModel): """ acronym = "Ridge" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = True native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "linear_model" @@ -2848,11 +2877,12 @@ class StochasticGradientDescent(ClassRegModel): """ acronym = "SGD" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = "max_iter" + validation = "max_iter" supports_engines = ("sklearn",) _module = "linear_model" @@ -2933,11 +2963,12 @@ class SupportVectorMachine(ClassRegModel): """ acronym = "SVM" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = None + validation = None supports_engines = ("sklearn", "sklearnex", "cuml") _module = "svm" @@ -2959,9 +2990,8 @@ def _get_parameters(self, trial: Trial) -> dict: """ params = super()._get_parameters(trial) - if self._get_param("kernel", params) == "poly": - if "gamma" in params: - params["gamma"] = "scale" # Crashes in combination with "auto" + if self._get_param("kernel", params) == "poly" and "gamma" in params: + params["gamma"] = "scale" # Crashes in combination with "auto" return params @@ -3053,11 +3083,12 @@ class XGBoost(ClassRegModel): """ acronym = "XGB" + handles_missing = False needs_scaling = True accepts_sparse = True native_multilabel = False native_multioutput = False - has_validation = "n_estimators" + validation = "n_estimators" supports_engines = ("xgboost",) _module = "xgboost" @@ -3174,8 +3205,8 @@ def _fit_estimator( # Add the pruned step to the output step = str(ex).split(" ")[-1][:-1] - steps = estimator.get_params()[self.has_validation] - trial.params[self.has_validation] = f"{step}/{steps}" + steps = estimator.get_params()[self.validation] + trial.params[self.validation] = f"{step}/{steps}" trial.set_user_attr("estimator", estimator) raise ex diff --git a/atom/models/custom.py b/atom/models/custom.py index 85bd79b72..4697f1d7f 100644 --- a/atom/models/custom.py +++ b/atom/models/custom.py @@ -41,10 +41,11 @@ def __init__(self, **kwargs): f"the model's acronym." 
) + self.handles_missing = getattr(est, "handles_missing", False) self.needs_scaling = getattr(est, "needs_scaling", False) self.native_multilabel = getattr(est, "native_multilabel", False) self.native_multioutput = getattr(est, "native_multioutput", False) - self.has_validation = getattr(est, "has_validation", None) + self.validation = getattr(est, "validation", None) super().__init__(name=name, **kwargs) diff --git a/atom/models/ensembles.py b/atom/models/ensembles.py index 5c8e023dc..29224d57b 100644 --- a/atom/models/ensembles.py +++ b/atom/models/ensembles.py @@ -28,8 +28,9 @@ class Stacking(ClassRegModel): """ acronym = "Stack" + handles_missing = False needs_scaling = False - has_validation = None + validation = None native_multilabel = False native_multioutput = False supports_engines = () @@ -83,8 +84,9 @@ class Voting(ClassRegModel): """ acronym = "Vote" + handles_missing = False needs_scaling = False - has_validation = None + validation = None native_multilabel = False native_multioutput = False supports_engines = () diff --git a/atom/models/ts.py b/atom/models/ts.py index 2e732e7ab..b10fe99cc 100644 --- a/atom/models/ts.py +++ b/atom/models/ts.py @@ -11,10 +11,12 @@ from optuna.distributions import BaseDistribution from optuna.distributions import CategoricalDistribution as Cat +from optuna.distributions import FloatDistribution as Float from optuna.distributions import IntDistribution as Int from optuna.trial import Trial from atom.basemodel import ForecastModel +from atom.utils.types import Predictor class ARIMA(ForecastModel): @@ -23,7 +25,7 @@ class ARIMA(ForecastModel): Seasonal ARIMA models and exogenous input is supported, hence this estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX. - An ARIMA model, is a generalization of an autoregressive moving + An ARIMA model is a generalization of an autoregressive moving average (ARMA) model, and is fitted to time-series data in an effort to forecast future points. ARIMA models can be especially efficacious in cases where data shows evidence of non-stationarity. @@ -72,18 +74,17 @@ class ARIMA(ForecastModel): """ acronym = "ARIMA" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None + handles_missing = True + uses_exogenous = True + in_sample_prediction = True + native_multivariate = False supports_engines = ("sktime",) _module = "sktime.forecasting.arima" _estimators: ClassVar[dict[str, str]] = {"forecast": "ARIMA"} _order = ("p", "d", "q") - _sorder = ("P", "D", "Q", "S") + _seasonal_order = ("P", "D", "Q", "S") def _get_parameters(self, trial: Trial) -> dict[str, BaseDistribution]: """Get the trial's hyperparameters. 
@@ -103,7 +104,7 @@ def _get_parameters(self, trial: Trial) -> dict[str, BaseDistribution]: # If no seasonal periodicity, set seasonal components to zero if self._get_param("S", params) == 0: - for p in self._sorder: + for p in self._seasonal_order: if p in params: params[p] = 0 @@ -128,8 +129,8 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]: # Convert params to hyperparameters 'order' and 'seasonal_order' if all(p in params for p in self._order): params["order"] = tuple(params.pop(p) for p in self._order) - if all(p in params for p in self._sorder): - params["seasonal_order"] = tuple(params.pop(p) for p in self._sorder) + if all(p in params for p in self._seasonal_order): + params["seasonal_order"] = tuple(params.pop(p) for p in self._seasonal_order) return params @@ -162,7 +163,7 @@ def _get_distributions(self) -> dict[str, BaseDistribution]: for p in self._order: dist.pop(p) if "seasonal_order" in self._est_params: - for p in self._sorder: + for p in self._seasonal_order: dist.pop(p) return dist @@ -214,11 +215,10 @@ class AutoARIMA(ForecastModel): """ acronym = "AutoARIMA" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None + handles_missing = True + uses_exogenous = True + in_sample_prediction = True + native_multivariate = False supports_engines = ("sktime",) _module = "sktime.forecasting.arima" @@ -243,12 +243,161 @@ def _get_distributions() -> dict[str, BaseDistribution]: } -class ExponentialSmoothing(ForecastModel): - """Exponential Smoothing forecaster. +class BATS(ForecastModel): + """BATS forecaster with multiple seasonality. + + BATS is an acronym for: + + - Box-Cox transformation + - ARMA errors + - Trend + - Seasonal components - Holt-Winters exponential smoothing forecaster. The default settings - use simple exponential smoothing, without trend and seasonality - components. + BATS was designed to forecast time series with multiple seasonal + periods. For example, daily data may have a weekly pattern as well + as an annual pattern. Or hourly data can have three seasonal periods: + a daily pattern, a weekly pattern, and an annual pattern. + + In BATS, a [Box-Cox transformation][boxcox] is applied to the + original time series, and then this is modeled as a linear + combination of an exponentially smoothed trend, a seasonal + component and an ARMA component. BATS conducts some hyperparameter + tuning (e.g., which of these components to keep and which to discard) + using AIC. + + Corresponding estimators are: + + - [BATS][batsclass] for forecasting tasks. + + See Also + -------- + atom.models:ARIMA + atom.models:AutoARIMA + atom.models:TBATS + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="BATS", verbose=2) + ``` + + """ + + acronym = "BATS" + handles_missing = False + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False + supports_engines = ("sktime",) + + _module = "sktime.forecasting.bats" + _estimators: ClassVar[dict[str, str]] = {"forecast": "BATS"} + + def _get_est(self, params: dict[str, Any]) -> Predictor: + """Get the model's estimator with unpacked parameters. + + Parameters + ---------- + params: dict + Hyperparameters for the estimator. + + Returns + ------- + Predictor + Estimator instance.
+ + """ + return self._est_class( + show_warnings=params.pop("show_warnings", self.warnings in ("always", "default")), + n_jobs=params.pop("n_jobs", self.n_jobs), + **params, + ) + + @staticmethod + def _get_distributions() -> dict[str, BaseDistribution]: + """Get the predefined hyperparameter distributions. + + Returns + ------- + dict + Hyperparameter distributions. + + """ + return { + "use_box_cox": Cat([True, False, None]), + "use_trend": Cat([True, False, None]), + "use_damped_trend": Cat([True, False, None]), + "use_arma_errors": Cat([True, False]), + } + + +class Croston(ForecastModel): + """Croston's method for forecasting. + + Croston's method is a modification of (vanilla) exponential + smoothing to handle intermittent time series. A time series is + considered intermittent if many of its values are zero and the + gaps between non-zero entries are not periodic. + + Croston's method will predict a constant value for all future + times, so Croston's method essentially provides another notion + for the average value of a time series. + + Corresponding estimators are: + + - [Croston][crostonclass] for forecasting tasks. + + See Also + -------- + atom.models:ExponentialSmoothing + atom.models:ETS + atom.models:NaiveForecaster + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="Croston", verbose=2) + ``` + + """ + + acronym = "Croston" + handles_missing = False + uses_exogenous = True + in_sample_prediction = True + native_multivariate = False + supports_engines = ("sktime",) + + _module = "sktime.forecasting.croston" + _estimators: ClassVar[dict[str, str]] = {"forecast": "Croston"} + + @staticmethod + def _get_distributions() -> dict[str, BaseDistribution]: + """Get the predefined hyperparameter distributions. + + Returns + ------- + dict + Hyperparameter distributions. + + """ + return {"smoothing": Float(0, 1, step=0.1)} + + +class ExponentialSmoothing(ForecastModel): + """Holt-Winters Exponential Smoothing forecaster. Corresponding estimators are: @@ -275,16 +424,36 @@ class ExponentialSmoothing(ForecastModel): """ acronym = "ES" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None + handles_missing = False + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False supports_engines = ("sktime",) _module = "sktime.forecasting.exp_smoothing" _estimators: ClassVar[dict[str, str]] = {"forecast": "ExponentialSmoothing"} + def _get_parameters(self, trial: Trial) -> dict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + dict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if not self._get_param("trend", params) and "damped_trend" in params: + params["damped_trend"] = False + + return params + @staticmethod def _get_distributions() -> dict[str, BaseDistribution]: """Get the predefined hyperparameter distributions. @@ -295,8 +464,6 @@ def _get_distributions() -> dict[str, BaseDistribution]: Hyperparameter distributions. 
""" - methods = ["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"] - return { "trend": Cat(["add", "mul", None]), "damped_trend": Cat([True, False]), @@ -304,7 +471,8 @@ def _get_distributions() -> dict[str, BaseDistribution]: "sp": Cat([4, 6, 7, 12, None]), "use_boxcox": Cat([True, False]), "initialization_method": Cat(["estimated", "heuristic"]), - "method": Cat(methods), + "method": Cat(["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"]), + "use_brute": Cat([True, False]), } @@ -342,16 +510,36 @@ class ETS(ForecastModel): """ acronym = "ETS" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None + handles_missing = True + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False supports_engines = ("sktime",) _module = "sktime.forecasting.ets" _estimators: ClassVar[dict[str, str]] = {"forecast": "AutoETS"} + def _get_parameters(self, trial: Trial) -> dict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + dict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if not self._get_param("trend", params) and "damped_trend" in params: + params["damped_trend"] = False + + return params + @staticmethod def _get_distributions() -> dict[str, BaseDistribution]: """Get the predefined hyperparameter distributions. @@ -372,6 +560,10 @@ def _get_distributions() -> dict[str, BaseDistribution]: "maxiter": Int(500, 2000, step=100), "auto": Cat([True, False]), "information_criterion": Cat(["aic", "bic", "aicc"]), + "allow_multiplicative_trend": Cat([True, False]), + "restrict": Cat([True, False]), + "additive_only": Cat([True, False]), + "ignore_inf_ic": Cat([True, False]), } @@ -409,11 +601,10 @@ class NaiveForecaster(ForecastModel): """ acronym = "NF" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None + handles_missing = True + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False supports_engines = ("sktime",) _module = "sktime.forecasting.naive" @@ -464,11 +655,10 @@ class PolynomialTrend(ForecastModel): """ acronym = "PT" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None + handles_missing = False + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False supports_engines = ("sktime",) _module = "sktime.forecasting.trend" @@ -488,3 +678,221 @@ def _get_distributions() -> dict[str, BaseDistribution]: "degree": Int(1, 5), "with_intercept": Cat([True, False]), } + + +class STL(ForecastModel): + """Seasonal-Trend decomposition using Loess. + + STL is a technique commonly used for decomposing time series data + into components like trend, seasonality, and residuals. + + Corresponding estimators are: + + - [STLForecaster][] for forecasting tasks. 
+ + See Also + -------- + atom.models:Croston + atom.models:ETS + atom.models:Theta + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="STL", verbose=2) + ``` + + """ + + acronym = "STL" + handles_missing = False + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False + supports_engines = ("sktime",) + + _module = "sktime.forecasting.trend" + _estimators: ClassVar[dict[str, str]] = {"forecast": "STLForecaster"} + + @staticmethod + def _get_distributions() -> dict[str, BaseDistribution]: + """Get the predefined hyperparameter distributions. + + Returns + ------- + dict + Hyperparameter distributions. + + """ + return { + "seasonal": Int(3, 11, step=2), + "seasonal_deg": Cat([0, 1]), + "trend_deg": Cat([0, 1]), + "low_pass_deg": Cat([0, 1]), + "robust": Cat([True, False]), + } + + +class TBATS(ForecastModel): + """TBATS forecaster with multiple seasonality. + + TBATS is an acronym for: + + - Trigonometric seasonality + - Box-Cox transformation + - ARMA errors + - Trend + - Seasonal components + + TBATS was designed to forecast time series with multiple seasonal + periods. For example, daily data may have a weekly pattern as well + as an annual pattern. Or hourly data can have three seasonal periods: + a daily pattern, a weekly pattern, and an annual pattern. + + In TBATS, a [Box-Cox transformation][boxcox] is applied to the + original time series, and then this is modeled as a linear + combination of an exponentially smoothed trend, a seasonal + component and an ARMA component. The seasonal components are + modeled by trigonometric functions via Fourier series. TBATS + conducts some hyperparameter tuning (e.g., which of these + components to keep and which to discard) using AIC. + + Corresponding estimators are: + + - [TBATS][tbatsclass] for forecasting tasks. + + See Also + -------- + atom.models:BATS + atom.models:ARIMA + atom.models:AutoARIMA + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="TBATS", verbose=2) + ``` + + """ + + acronym = "TBATS" + handles_missing = False + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False + supports_engines = ("sktime",) + + _module = "sktime.forecasting.tbats" + _estimators: ClassVar[dict[str, str]] = {"forecast": "TBATS"} + + def _get_est(self, params: dict[str, Any]) -> Predictor: + """Get the model's estimator with unpacked parameters. + + Parameters + ---------- + params: dict + Hyperparameters for the estimator. + + Returns + ------- + Predictor + Estimator instance. + + """ + return self._est_class( + show_warnings=params.pop("show_warnings", self.warnings in ("always", "default")), + n_jobs=params.pop("n_jobs", self.n_jobs), + **params, + ) + + @staticmethod + def _get_distributions() -> dict[str, BaseDistribution]: + """Get the predefined hyperparameter distributions. + + Returns + ------- + dict + Hyperparameter distributions. + + """ + return { + "use_box_cox": Cat([True, False, None]), + "use_trend": Cat([True, False, None]), + "use_damped_trend": Cat([True, False, None]), + "use_arma_errors": Cat([True, False]), + } + + +class Theta(ForecastModel): + """Theta method for forecasting. + + The theta method is equivalent to simple [ExponentialSmoothing][] + with drift.
The series is tested for seasonality, and, if deemed + seasonal, the series is seasonally adjusted using a classical + multiplicative decomposition before applying the theta method. The + resulting forecasts are then reseasonalised. + + In cases where SES results in a constant forecast, the theta + forecaster will revert to predicting the SES constant plus a linear + trend derived from the training data. + + Prediction intervals are computed using the underlying state space + model. + + Corresponding estimators are: + + - [ThetaForecaster][] for forecasting tasks. + + See Also + -------- + atom.models:Croston + atom.models:ExponentialSmoothing + atom.models:PolynomialTrend + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="Theta", verbose=2) + ``` + + """ + + acronym = "Theta" + handles_missing = False + uses_exogenous = False + in_sample_prediction = True + native_multivariate = False + supports_engines = ("sktime",) + + _module = "sktime.forecasting.theta" + _estimators: ClassVar[dict[str, str]] = {"forecast": "ThetaForecaster"} + + @staticmethod + def _get_distributions() -> dict[str, BaseDistribution]: + """Get the predefined hyperparameter distributions. + + Returns + ------- + dict + Hyperparameter distributions. + + """ + return {"deseasonalize": Cat([False, True])} diff --git a/atom/nlp.py b/atom/nlp.py index 28ab2e61f..418b6279c 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -374,10 +374,10 @@ class TextNormalizer(TransformerMixin): Attributes ---------- feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -606,10 +606,10 @@ class Tokenizer(TransformerMixin): Created quadgrams and their frequencies. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`. See Also -------- @@ -808,21 +808,22 @@ class Vectorizer(TransformerMixin): `#!python device="gpu"` to use the GPU. Read more in the [user guide][gpu-acceleration]. - engine: dict, default={"data": "numpy", "estimator": "sklearn"} + engine: dict or None, default=None Execution engine to use for [data][data-acceleration] and [estimators][estimator-acceleration]. The value should be a dictionary with keys `data` and/or `estimator`, with their - corresponding choice as values. Choose from: + corresponding choice as values. If None, the default values + are used. Choose from: - "data": - - "numpy" + - "numpy" (default) - "pyarrow" - "modin" - "estimator": - - "sklearn" + - "sklearn" (default) - "cuml" verbose: int, default=0 @@ -847,10 +848,10 @@ class Vectorizer(TransformerMixin): corpus, e.g., `vectorizer.tfidf` for the tfidf strategy. feature_names_in_: np.ndarray - Names of features seen during fit. + Names of features seen during `fit`. n_features_in_: int - Number of features seen during fit. + Number of features seen during `fit`.
See Also diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py index e221e0eaf..a05983456 100644 --- a/atom/plots/dataplot.py +++ b/atom/plots/dataplot.py @@ -652,7 +652,7 @@ def get_text(column: Series) -> Series: fig.add_trace( go.Bar( - x=(data := series[-self._get_show(show, len(series)) :]), + x=(data := series[-self._get_show(show, len(series)):]), y=data.index, orientation="h", marker={ diff --git a/atom/utils/types.py b/atom/utils/types.py index 4b0a95d16..a606fb7c1 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -159,14 +159,6 @@ def predict(self, *args, **kwargs) -> Pandas: ... Scalar: TypeAlias = Int | Float Segment: TypeAlias = range | slice Index: TypeAlias = pd.Index | md.Index -TSIndex: TypeAlias = ( - pd.PeriodIndex - | md.PeriodIndex - | pd.DatetimeIndex - | md.DatetimeIndex - | pd.TimedeltaIndex - | md.TimedeltaIndex -) Series: TypeAlias = pd.Series | md.Series DataFrame: TypeAlias = pd.DataFrame | md.DataFrame Pandas: TypeAlias = Series | DataFrame @@ -262,6 +254,7 @@ def predict(self, *args, **kwargs) -> Pandas: ... | dict[str, IntLargerEqualZero] | Sequence[IntLargerEqualZero] ) +HarmonicsSelector: TypeAlias = Literal["drop", "raw_strength", "harmonic_strength"] # Allowed values for method selection PredictionMethods: TypeAlias = Literal[ @@ -310,7 +303,6 @@ def predict(self, *args, **kwargs) -> Pandas: ... float_t = (float, np.floating) segment_t = (slice, range) index_t = (pd.Index, md.Index) -tsindex_t = TSIndex.__args__ series_t = (pd.Series, md.Series) sequence_t = (range, list, tuple, np.ndarray, *index_t, *series_t) dataframe_t = (pd.DataFrame, md.DataFrame) diff --git a/atom/utils/utils.py b/atom/utils/utils.py index bbdbbc4d4..c2713de39 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -16,7 +16,7 @@ from contextlib import contextmanager from copy import copy from dataclasses import dataclass -from enum import Enum +from enum import Enum, IntEnum from functools import cached_property, wraps from importlib import import_module from importlib.util import find_spec @@ -178,6 +178,29 @@ def is_multioutput(self) -> bool: return self.value in (2, 3, 5, 7) +class SeasonalPeriod(IntEnum): + """Seasonal periodicity. + + Covers pandas' aliases for periods. + See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#period-aliases + + """ + + B = 5 # business day + D = 7 # calendar day + W = 52 # week + M = 12 # month + Q = 4 # quarter + A = 1 # year + Y = 1 # year + H = 24 # hours + T = 60 # minutes + S = 60 # seconds + L = 1e3 # milliseconds + U = 1e6 # microseconds + N = 1e9 # nanoseconds + + @dataclass class DataContainer: """Stores a branch's data.""" @@ -222,6 +245,7 @@ class DataConfig: index: IndexSelector = True ignore: tuple[str, ...] = () + sp: int | list[int] | None = None shuffle: Bool = False stratify: IndexSelector = True n_rows: Scalar = 1 diff --git a/docs_sources/api/models/adab.md b/docs_sources/api/models/adab.md index aac1b7ca9..ceeefc253 100644 --- a/docs_sources/api/models/adab.md +++ b/docs_sources/api/models/adab.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
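The `SeasonalPeriod` enum added to `atom/utils/utils.py` above maps pandas period aliases to seasonal periods. A minimal sketch of how such a mapping can resolve an `sp` value given as an alias, assuming a plain name lookup; `resolve_sp` is a hypothetical helper, not part of the patch:

```python
from enum import IntEnum

class SeasonalPeriod(IntEnum):
    """Subset of the alias-to-period mapping introduced above."""
    D = 7   # calendar day -> weekly pattern
    W = 52  # week -> yearly pattern
    M = 12  # month -> yearly pattern
    Q = 4   # quarter -> yearly pattern
    H = 24  # hour -> daily pattern

def resolve_sp(sp: int | str) -> int:
    """Map an int or pandas-style period alias to a seasonal period."""
    if isinstance(sp, str):
        return SeasonalPeriod[sp].value  # raises KeyError for unknown aliases
    return int(sp)

print(resolve_sp("M"), resolve_sp(7))  # 12 7
```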
diff --git a/docs_sources/api/models/ard.md b/docs_sources/api/models/ard.md index 5c1ae30c5..b2b7846a8 100644 --- a/docs_sources/api/models/ard.md +++ b/docs_sources/api/models/ard.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/arima.md b/docs_sources/api/models/arima.md index bd7d0a355..323bb3cce 100644 --- a/docs_sources/api/models/arima.md +++ b/docs_sources/api/models/arima.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
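Related to the ARIMA changes in `ts.py` above: the tuner samples p, d, q (and P, D, Q, S) individually, and `_trial_to_est` regroups them into the `order` and `seasonal_order` tuples that sktime's ARIMA expects. A minimal standalone sketch of that regrouping, with module-level stand-ins for the class attributes:

```python
_order = ("p", "d", "q")
_seasonal_order = ("P", "D", "Q", "S")

def trial_to_est(params: dict) -> dict:
    """Regroup individually sampled components into sktime-style tuples."""
    if all(p in params for p in _order):
        params["order"] = tuple(params.pop(p) for p in _order)
    if all(p in params for p in _seasonal_order):
        params["seasonal_order"] = tuple(params.pop(p) for p in _seasonal_order)
    return params

print(trial_to_est({"p": 2, "d": 1, "q": 2, "P": 1, "D": 0, "Q": 1, "S": 12}))
# {'order': (2, 1, 2), 'seasonal_order': (1, 0, 1, 12)}
```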
diff --git a/docs_sources/api/models/autoarima.md b/docs_sources/api/models/autoarima.md index 6a2cc5d3a..31ae376db 100644 --- a/docs_sources/api/models/autoarima.md +++ b/docs_sources/api/models/autoarima.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/bag.md b/docs_sources/api/models/bag.md index 02820e833..03713c540 100644 --- a/docs_sources/api/models/bag.md +++ b/docs_sources/api/models/bag.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
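The `_get_parameters` overrides added to ExponentialSmoothing and ETS further up prune an incompatible combination: a damped trend is only meaningful when a trend component is present. A minimal sketch of that guard; `prune_params` is a stand-in name for illustration:

```python
def prune_params(params: dict) -> dict:
    """Force damped_trend off when no trend component was sampled."""
    if not params.get("trend") and "damped_trend" in params:
        params["damped_trend"] = False
    return params

print(prune_params({"trend": None, "damped_trend": True}))   # damped_trend -> False
print(prune_params({"trend": "add", "damped_trend": True}))  # unchanged
```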
diff --git a/docs_sources/api/models/bats.md b/docs_sources/api/models/bats.md new file mode 100644 index 000000000..15ce1fed2 --- /dev/null +++ b/docs_sources/api/models/bats.md @@ -0,0 +1,80 @@ +# BATS +------ + +:: atom.models:BATS + :: tags + :: description + :: see also + +
+ +## Example + +:: examples + +
+<br><br>
+ +## Hyperparameters + +:: hyperparameters + +
+<br><br>
+ +## Attributes + +### Data attributes + +:: table: + - attributes: + from_docstring: False + include: + - pipeline + - atom.branch:Branch.mapping + - dataset + - train + - test + - X + - y + - X_train + - y_train + - X_test + - atom.branch:Branch.y_test + - X_holdout + - y_holdout + - shape + - columns + - n_columns + - features + - n_features + - atom.branch:Branch.target + +
+ +### Utility attributes + +:: table: + - attributes: + from_docstring: False + include: + - name + - run + - study + - trials + - best_trial + - best_params + - estimator + - bootstrap + - results + - feature_importance + +
+<br><br>
+ +## Methods + +The [plots][available-plots] can be called directly from the model. +The remaining utility methods can be found hereunder. + +:: methods: + toc_only: False + exclude: + - plot_.* diff --git a/docs_sources/api/models/bnb.md b/docs_sources/api/models/bnb.md index ddad46cef..0082f8ea0 100644 --- a/docs_sources/api/models/bnb.md +++ b/docs_sources/api/models/bnb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
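The BATS search space defined in `ts.py` tunes four switches (`use_box_cox`, `use_trend`, `use_damped_trend`, `use_arma_errors`) and lets the estimator's AIC-based selection do the rest. A hedged sketch of pinning those switches through `est_params` instead of tuning them; whether fixing them helps depends on the series:

```pycon
from atom import ATOMForecaster
from sktime.datasets import load_airline

y = load_airline()

atom = ATOMForecaster(y, random_state=1)
# Fixed hyperparameters are passed to the estimator and excluded from tuning
atom.run(models="BATS", est_params={"use_box_cox": True, "use_arma_errors": False})
```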
diff --git a/docs_sources/api/models/br.md b/docs_sources/api/models/br.md index e14cf12e0..780b024f8 100644 --- a/docs_sources/api/models/br.md +++ b/docs_sources/api/models/br.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/catb.md b/docs_sources/api/models/catb.md index 1ffc2d37a..c230c61e2 100644 --- a/docs_sources/api/models/catb.md +++ b/docs_sources/api/models/catb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/catnb.md b/docs_sources/api/models/catnb.md index 421703f2a..f00783e7b 100644 --- a/docs_sources/api/models/catnb.md +++ b/docs_sources/api/models/catnb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/cnb.md b/docs_sources/api/models/cnb.md index 20a45e626..33cb1ca65 100644 --- a/docs_sources/api/models/cnb.md +++ b/docs_sources/api/models/cnb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/croston.md b/docs_sources/api/models/croston.md new file mode 100644 index 000000000..01afd1f9b --- /dev/null +++ b/docs_sources/api/models/croston.md @@ -0,0 +1,80 @@ +# Croston +--------- + +:: atom.models:Croston + :: tags + :: description + :: see also + +
+ +## Example + +:: examples + +
+<br><br>
+ +## Hyperparameters + +:: hyperparameters + +
+<br><br>
+ +## Attributes + +### Data attributes + +:: table: + - attributes: + from_docstring: False + include: + - pipeline + - atom.branch:Branch.mapping + - dataset + - train + - test + - X + - y + - X_train + - y_train + - X_test + - atom.branch:Branch.y_test + - X_holdout + - y_holdout + - shape + - columns + - n_columns + - features + - n_features + - atom.branch:Branch.target + +
+ +### Utility attributes + +:: table: + - attributes: + from_docstring: False + include: + - name + - run + - study + - trials + - best_trial + - best_params + - estimator + - bootstrap + - results + - feature_importance + +
+<br><br>
+ +## Methods + +The [plots][available-plots] can be called directly from the model. +The remaining utility methods can be found hereunder. + +:: methods: + toc_only: False + exclude: + - plot_.* diff --git a/docs_sources/api/models/dummy.md b/docs_sources/api/models/dummy.md index d0204e44e..d190ad628 100644 --- a/docs_sources/api/models/dummy.md +++ b/docs_sources/api/models/dummy.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
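Croston targets intermittent series: mostly zeros, with aperiodic gaps between the non-zero entries. A hedged sketch with synthetic intermittent demand, assuming a daily `PeriodIndex` is acceptable as input here:

```pycon
import numpy as np
import pandas as pd
from atom import ATOMForecaster

# Synthetic intermittent demand: mostly zeros, occasional small counts
rng = np.random.default_rng(1)
y = pd.Series(
    rng.poisson(0.2, 200).astype(float),
    index=pd.period_range("2020-01-01", periods=200, freq="D"),
)

atom = ATOMForecaster(y, random_state=1)
atom.run(models="Croston", verbose=2)
```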
diff --git a/docs_sources/api/models/en.md b/docs_sources/api/models/en.md index aecaa94e5..efe731da3 100644 --- a/docs_sources/api/models/en.md +++ b/docs_sources/api/models/en.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/es.md b/docs_sources/api/models/es.md index 3839dce5c..73fa78a15 100644 --- a/docs_sources/api/models/es.md +++ b/docs_sources/api/models/es.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/et.md b/docs_sources/api/models/et.md index 8ecb0dd85..4528143f5 100644 --- a/docs_sources/api/models/et.md +++ b/docs_sources/api/models/et.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/etree.md b/docs_sources/api/models/etree.md index 0b99c0ea3..66b432072 100644 --- a/docs_sources/api/models/etree.md +++ b/docs_sources/api/models/etree.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/ets.md b/docs_sources/api/models/ets.md index b918f5f0c..9da10b282 100644 --- a/docs_sources/api/models/ets.md +++ b/docs_sources/api/models/ets.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/gbm.md b/docs_sources/api/models/gbm.md index 8d7de985b..0e82b1d69 100644 --- a/docs_sources/api/models/gbm.md +++ b/docs_sources/api/models/gbm.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/gnb.md b/docs_sources/api/models/gnb.md index 9602f5667..3c30521a9 100644 --- a/docs_sources/api/models/gnb.md +++ b/docs_sources/api/models/gnb.md @@ -35,11 +35,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/gp.md b/docs_sources/api/models/gp.md index 8d25d5fe1..3deec170e 100644 --- a/docs_sources/api/models/gp.md +++ b/docs_sources/api/models/gp.md @@ -35,11 +35,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/hgbm.md b/docs_sources/api/models/hgbm.md index 6e62c56b6..629b487cd 100644 --- a/docs_sources/api/models/hgbm.md +++ b/docs_sources/api/models/hgbm.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/huber.md b/docs_sources/api/models/huber.md index eef82d673..08591ce80 100644 --- a/docs_sources/api/models/huber.md +++ b/docs_sources/api/models/huber.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/knn.md b/docs_sources/api/models/knn.md index ec9a370b9..9f6e2cd41 100644 --- a/docs_sources/api/models/knn.md +++ b/docs_sources/api/models/knn.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/lars.md b/docs_sources/api/models/lars.md index f47b0f5f8..fa75faaa0 100644 --- a/docs_sources/api/models/lars.md +++ b/docs_sources/api/models/lars.md @@ -35,11 +35,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/lasso.md b/docs_sources/api/models/lasso.md index 951d50960..61c2811f5 100644 --- a/docs_sources/api/models/lasso.md +++ b/docs_sources/api/models/lasso.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/lda.md b/docs_sources/api/models/lda.md index adebdf66d..72e7b093f 100644 --- a/docs_sources/api/models/lda.md +++ b/docs_sources/api/models/lda.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/lgb.md b/docs_sources/api/models/lgb.md index 1d3f93fb5..601cebcdf 100644 --- a/docs_sources/api/models/lgb.md +++ b/docs_sources/api/models/lgb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/lr.md b/docs_sources/api/models/lr.md index aae69d158..8dfd23c47 100644 --- a/docs_sources/api/models/lr.md +++ b/docs_sources/api/models/lr.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/lsvm.md b/docs_sources/api/models/lsvm.md index 27552d3d6..5f3861385 100644 --- a/docs_sources/api/models/lsvm.md +++ b/docs_sources/api/models/lsvm.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/mlp.md b/docs_sources/api/models/mlp.md index 433205d05..4dc28b52e 100644 --- a/docs_sources/api/models/mlp.md +++ b/docs_sources/api/models/mlp.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/mnb.md b/docs_sources/api/models/mnb.md index 6d9d56c46..fdc01fa21 100644 --- a/docs_sources/api/models/mnb.md +++ b/docs_sources/api/models/mnb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/nf.md b/docs_sources/api/models/nf.md index c4bfb45d9..12c8ed8a6 100644 --- a/docs_sources/api/models/nf.md +++ b/docs_sources/api/models/nf.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/ols.md b/docs_sources/api/models/ols.md index 03e7f8a34..b5089aa8e 100644 --- a/docs_sources/api/models/ols.md +++ b/docs_sources/api/models/ols.md @@ -35,11 +35,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/omp.md b/docs_sources/api/models/omp.md index 00c396dfb..bc040d7f0 100644 --- a/docs_sources/api/models/omp.md +++ b/docs_sources/api/models/omp.md @@ -35,11 +35,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/pa.md b/docs_sources/api/models/pa.md index a5391c3b1..3115976ff 100644 --- a/docs_sources/api/models/pa.md +++ b/docs_sources/api/models/pa.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/perc.md b/docs_sources/api/models/perc.md index ea705bc68..673a89eed 100644 --- a/docs_sources/api/models/perc.md +++ b/docs_sources/api/models/perc.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/pt.md b/docs_sources/api/models/pt.md index 4cdbe2d53..4f96e83a4 100644 --- a/docs_sources/api/models/pt.md +++ b/docs_sources/api/models/pt.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/qda.md b/docs_sources/api/models/qda.md index d6a0c98ea..01e37edc8 100644 --- a/docs_sources/api/models/qda.md +++ b/docs_sources/api/models/qda.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/rf.md b/docs_sources/api/models/rf.md index ecd381f80..3372776b6 100644 --- a/docs_sources/api/models/rf.md +++ b/docs_sources/api/models/rf.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/ridge.md b/docs_sources/api/models/ridge.md index f856522c1..d605b7b40 100644 --- a/docs_sources/api/models/ridge.md +++ b/docs_sources/api/models/ridge.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/rnn.md b/docs_sources/api/models/rnn.md index d0a41a0eb..5c13d45cb 100644 --- a/docs_sources/api/models/rnn.md +++ b/docs_sources/api/models/rnn.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/sgd.md b/docs_sources/api/models/sgd.md index 317c28060..28a514681 100644 --- a/docs_sources/api/models/sgd.md +++ b/docs_sources/api/models/sgd.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/api/models/stl.md b/docs_sources/api/models/stl.md new file mode 100644 index 000000000..82477bbcd --- /dev/null +++ b/docs_sources/api/models/stl.md @@ -0,0 +1,80 @@ +# STL +----- + +:: atom.models:STL + :: tags + :: description + :: see also + +
+ +## Example + +:: examples + +
+<br><br>
+ +## Hyperparameters + +:: hyperparameters + +
+<br><br>
+ +## Attributes + +### Data attributes + +:: table: + - attributes: + from_docstring: False + include: + - pipeline + - atom.branch:Branch.mapping + - dataset + - train + - test + - X + - y + - X_train + - y_train + - X_test + - atom.branch:Branch.y_test + - X_holdout + - y_holdout + - shape + - columns + - n_columns + - features + - n_features + - atom.branch:Branch.target + +
+ +### Utility attributes + +:: table: + - attributes: + from_docstring: False + include: + - name + - run + - study + - trials + - best_trial + - best_params + - estimator + - bootstrap + - results + - feature_importance + +
+<br><br>
+ +## Methods + +The [plots][available-plots] can be called directly from the model. +The remaining utility methods can be found hereunder. + +:: methods: + toc_only: False + exclude: + - plot_.* diff --git a/docs_sources/api/models/svm.md b/docs_sources/api/models/svm.md index 6db11ab25..e34159225 100644 --- a/docs_sources/api/models/svm.md +++ b/docs_sources/api/models/svm.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
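Note that the `seasonal` distribution for STL in `ts.py` above, `Int(3, 11, step=2)`, yields odd values only; statsmodels' STL implementation requires an odd seasonal-smoother length, which is presumably why the step is 2. A quick check of the candidate grid:

```python
# Candidates produced by Int(3, 11, step=2): odd smoother lengths only
candidates = list(range(3, 12, 2))
print(candidates)                           # [3, 5, 7, 9, 11]
print(all(c % 2 == 1 for c in candidates))  # True
```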
diff --git a/docs_sources/api/models/tbats.md b/docs_sources/api/models/tbats.md new file mode 100644 index 000000000..b5759ca48 --- /dev/null +++ b/docs_sources/api/models/tbats.md @@ -0,0 +1,80 @@ +# TBATS +------- + +:: atom.models:TBATS + :: tags + :: description + :: see also + +
+ +## Example + +:: examples + +
+<br><br>
+ +## Hyperparameters + +:: hyperparameters + +
+<br><br>
+ +## Attributes + +### Data attributes + +:: table: + - attributes: + from_docstring: False + include: + - pipeline + - atom.branch:Branch.mapping + - dataset + - train + - test + - X + - y + - X_train + - y_train + - X_test + - atom.branch:Branch.y_test + - X_holdout + - y_holdout + - shape + - columns + - n_columns + - features + - n_features + - atom.branch:Branch.target + +
+ +### Utility attributes + +:: table: + - attributes: + from_docstring: False + include: + - name + - run + - study + - trials + - best_trial + - best_params + - estimator + - bootstrap + - results + - feature_importance + +
+<br><br>
+ +## Methods + +The [plots][available-plots] can be called directly from the model. +The remaining utility methods can be found hereunder. + +:: methods: + toc_only: False + exclude: + - plot_.* diff --git a/docs_sources/api/models/theta.md b/docs_sources/api/models/theta.md new file mode 100644 index 000000000..906667ec2 --- /dev/null +++ b/docs_sources/api/models/theta.md @@ -0,0 +1,80 @@ +# Theta +------- + +:: atom.models:Theta + :: tags + :: description + :: see also + +
+ +## Example + +:: examples + +
+<br><br>
+ +## Hyperparameters + +:: hyperparameters + +
+<br><br>
+ +## Attributes + +### Data attributes + +:: table: + - attributes: + from_docstring: False + include: + - pipeline + - atom.branch:Branch.mapping + - dataset + - train + - test + - X + - y + - X_train + - y_train + - X_test + - atom.branch:Branch.y_test + - X_holdout + - y_holdout + - shape + - columns + - n_columns + - features + - n_features + - atom.branch:Branch.target + +
+ +### Utility attributes + +:: table: + - attributes: + from_docstring: False + include: + - name + - run + - study + - trials + - best_trial + - best_params + - estimator + - bootstrap + - results + - feature_importance + +
+<br><br>
+ +## Methods + +The [plots][available-plots] can be called directly from the model. +The remaining utility methods can be found hereunder. + +:: methods: + toc_only: False + exclude: + - plot_.* diff --git a/docs_sources/api/models/tree.md b/docs_sources/api/models/tree.md index 730ed02f4..a59618bd8 100644 --- a/docs_sources/api/models/tree.md +++ b/docs_sources/api/models/tree.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
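Theta's single tuned hyperparameter, `deseasonalize`, toggles the seasonal-adjust-then-reseasonalise flow described in its docstring. A hedged sketch against sktime's ThetaForecaster directly, with sp=12 assumed for the monthly airline data:

```pycon
from sktime.datasets import load_airline
from sktime.forecasting.theta import ThetaForecaster

y = load_airline()

# Seasonally adjust, apply the theta method, then reseasonalise the forecasts
forecaster = ThetaForecaster(deseasonalize=True, sp=12)
forecaster.fit(y)
print(forecaster.predict(fh=[1, 2, 3]))
```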
diff --git a/docs_sources/api/models/xgb.md b/docs_sources/api/models/xgb.md index 0021f5c82..532b077b3 100644 --- a/docs_sources/api/models/xgb.md +++ b/docs_sources/api/models/xgb.md @@ -41,11 +41,11 @@ - atom.branch:Branch.y_test - X_holdout - y_holdout - - atom.branch:Branch.shape - - atom.branch:Branch.columns - - atom.branch:Branch.n_columns - - atom.branch:Branch.features - - atom.branch:Branch.n_features + - shape + - columns + - n_columns + - features + - n_features - atom.branch:Branch.target
diff --git a/docs_sources/contributing.md b/docs_sources/contributing.md index 6894372fe..108de5968 100644 --- a/docs_sources/contributing.md +++ b/docs_sources/contributing.md @@ -91,7 +91,7 @@ and accept your changes. * Update the documentation so all of your changes are reflected there. * Adhere to [PEP 8](https://peps.python.org/pep-0008/) standards. * Use a maximum of 99 characters per line. Try to keep docstrings below - 74 characters. + 80 characters. * Update the project unit tests to test your code changes as thoroughly as possible. * Make sure that your code is properly commented with docstrings and diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md index fde131cb7..2a4b46020 100644 --- a/docs_sources/dependencies.md +++ b/docs_sources/dependencies.md @@ -66,6 +66,7 @@ additional libraries. You can install all the optional dependencies using * **[pmdarima](http://alkaline-ml.com/pmdarima/)** (>=2.0.3) * **[schemdraw](https://schemdraw.readthedocs.io/en/latest/index.html)** (>=0.16) * **[sweetviz](https://github.com/fbdesignpro/sweetviz)** (>=2.3.1) +* **[tbats](https://github.com/intive-DataScience/tbats)** (>=1.1.3) * **[wordcloud](http://amueller.github.io/word_cloud/)** (>=1.9.2) * **[xgboost](https://xgboost.readthedocs.io/en/latest/)** (>=2.0.0) @@ -75,7 +76,7 @@ additional libraries. You can install all the optional dependencies using The development dependencies are not installed with the package, and are not required for any of its functionalities. These libraries are only necessary to [contribute][contributing] to the project. Install them -running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/latest/) with +running `pdm install --dev` (remember to install [pdm](https://pdm-project.org/latest/) first with `pip install -U pdm`). 
**Linting** diff --git a/docs_sources/examples/deep_learning.ipynb b/docs_sources/examples/deep_learning.ipynb index 03735284c..ed7302b47 100644 --- a/docs_sources/examples/deep_learning.ipynb +++ b/docs_sources/examples/deep_learning.ipynb @@ -113,7 +113,7 @@ " estimator=ConvNN(verbose=0),\n", " acronym=\"CNN\",\n", " needs_scaling=True, # Applies automated feature scaling before fitting\n", - " has_validation=\"epochs\", # Applies in-training validation on parameter epochs\n", + " validation=\"epochs\", # Applies in-training validation on parameter epochs\n", ")" ] }, diff --git a/docs_sources/examples/in_training_validation.ipynb b/docs_sources/examples/in_training_validation.ipynb index 31c5c31cf..394f11407 100644 --- a/docs_sources/examples/in_training_validation.ipynb +++ b/docs_sources/examples/in_training_validation.ipynb @@ -106,7 +106,7 @@ " \n", " acronym\n", " model\n", - " has_validation\n", + " validation\n", " \n", " \n", " \n", @@ -157,7 +157,7 @@ "" ], "text/plain": [ - " acronym model has_validation\n", + " acronym model validation\n", "3 CatB CatBoost True\n", "15 LGB LightGBM True\n", "19 MLP MultiLayerPerceptron True\n", @@ -175,8 +175,8 @@ "source": [ "# Not all models support in-training validation\n", "# You can chek which ones do using the available_models method\n", - "df = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]]\n", - "df[df[\"has_validation\"]]" + "df = atom.available_models()[[\"acronym\", \"model\", \"validation\"]]\n", + "df[df[\"validation\"]]" ] }, { diff --git a/docs_sources/examples/multioutput_regression.ipynb b/docs_sources/examples/multioutput_regression.ipynb index ec37c7b37..9b089e42d 100644 --- a/docs_sources/examples/multioutput_regression.ipynb +++ b/docs_sources/examples/multioutput_regression.ipynb @@ -515,7 +515,7 @@ "\tneeds_scaling=True\n", "\tnative_multioutput=True\n", "\tnative_multilabel=False\n", - "\thas_validation=None\n", + "\tvalidation=None\n", ")\n" ] } diff --git a/docs_sources/scripts/autodocs.py b/docs_sources/scripts/autodocs.py index b9ed05cad..54fa85024 100644 --- a/docs_sources/scripts/autodocs.py +++ b/docs_sources/scripts/autodocs.py @@ -43,6 +43,7 @@ joblibmemory="https://joblib.readthedocs.io/en/latest/generated/joblib.Memory.html", warnings="https://docs.python.org/3/library/warnings.html#the-warnings-filter", datetimeindex="https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.html", + periodalias="https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#period-aliases", # ATOM rangeindex="https://pandas.pydata.org/docs/reference/api/pandas.RangeIndex.html", experiment="https://www.mlflow.org/docs/latest/tracking.html#organizing-runs-in-experiments", @@ -130,6 +131,7 @@ baggingclassifier="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html", baggingregressor="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html", bagdocs="https://scikit-learn.org/stable/modules/ensemble.html#bootstrapping", + batsclass="https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.bats.BATS.html", bayesianridgeclass="https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html", brdocs="https://scikit-learn.org/stable/modules/linear_model.html#bayesian-regression", bernoullinbclass="https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html", @@ -141,6 +143,7 @@ 
catnbdocs="https://scikit-learn.org/stable/modules/naive_bayes.html#categorical-naive-bayes", complementnbclass="https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html", cnbdocs="https://scikit-learn.org/stable/modules/naive_bayes.html#complement-naive-bayes", + crostonclass="https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.forecasting.croston.Croston.html", decisiontreeclassifier="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html", decisiontreeregressor="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html", treedocs="https://scikit-learn.org/stable/modules/tree.html", @@ -213,9 +216,12 @@ sgdclassifier="https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html", sgdregressor="https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html", sgddocs="https://scikit-learn.org/stable/modules/sgd.html", + stlforecaster="https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.forecasting.trend.STLForecaster.html", svc="https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html", svr="https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html", svmdocs="https://scikit-learn.org/stable/modules/svm.html", + tbatsclass="https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.tbats.TBATS.html", + thetaforecaster="https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.theta.ThetaForecaster.html", xgbclassifier="https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier", xgbregressor="https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor", xgbdocs="https://xgboost.readthedocs.io/en/latest/index.html", @@ -249,7 +255,6 @@ # Classes ========================================================== >> - @dataclass class DummyTrainer: """Dummy trainer class to call model instances.""" @@ -265,6 +270,7 @@ class AutoDocs: The docstring should follow the numpydoc style[^1]. Blocks should start with `::`. The following blocks are accepted: + - toc - tags - head (summary + description) - summary (first line of docstring, required) @@ -374,6 +380,27 @@ def parse_body(body: str) -> str: return text + "\n" + def get_toc(self) -> str: + """Return a toc of the objects in self. + + Note that the object must be iterable. + + Returns + ------- + str + Toc of the objects. + + """ + toc = "<table markdown>" + for obj in self.obj: + func = AutoDocs(obj) + + name = f"[{obj.__name__}][] ({obj.acronym})" + toc += f"<tr><td>{name}</td><td>{func.get_summary()}</td></tr>" + + toc += "</table>
" + return toc + def get_tags(self) -> str: """Return the object's tags. @@ -386,15 +413,17 @@ def get_tags(self) -> str: """ text = f"[{self.obj.acronym}][predefined-models]{{ .md-tag }}" - if self.obj.needs_scaling: + if getattr(self.obj, "needs_scaling", False): text += "  [needs scaling][automated-feature-scaling]{ .md-tag }" - if self.obj.accepts_sparse: + if getattr(self.obj, "accepts_sparse", False): text += "  [accept sparse][sparse-datasets]{ .md-tag }" - if self.obj.native_multilabel: + if getattr(self.obj, "native_multilabel", False): text += "  [native multilabel][multilabel]{ .md-tag }" - if self.obj.native_multioutput: + if getattr(self.obj, "native_multioutput", False): text += "  [native multioutput][multioutput-tasks]{ .md-tag }" - if self.obj.has_validation: + if getattr(self.obj, "native_multivariate", False): + text += "  [native multivariate][multivariate]{ .md-tag }" + if getattr(self.obj, "validation", None): text += "  [allows validation][in-training-validation]{ .md-tag }" if any(engine not in ("sklearn", "sktime") for engine in self.obj.supports_engines): text += "  [supports acceleration][estimator-acceleration]{ .md-tag }" @@ -484,7 +513,7 @@ def get_description(self) -> str: """ pattern = f".*?(?={'|'.join(self.blocks)})" - match = re.match(pattern, self.doc[len(self.get_summary()) :], re.S) + match = re.match(pattern, self.doc[len(self.get_summary()):], re.S) return match.group() if match else "" def get_see_also(self) -> str: @@ -560,8 +589,7 @@ def get_table(self, blocks: list) -> str: attrs = include else: attrs = [ - m - for m, _ in getmembers(self.obj, lambda x: not isroutine(x)) + m for m, _ in getmembers(self.obj, lambda x: not isroutine(x)) if not m.startswith("_") and not any(re.fullmatch(p, m) for p in config.get("exclude", [])) ] @@ -806,12 +834,14 @@ def render(markdown: str, **kwargs) -> str: if isinstance(command, str): if ":" in command: autodocs = AutoDocs.get_obj(command) - markdown = markdown[: match.start()] + markdown[match.end() :] + markdown = markdown[:match.start()] + markdown[match.end():] continue else: command = {command: None} # Has no options specified - if "tags" in command: + if "toc" in command: + text = autodocs.get_toc() + elif "tags" in command: text = autodocs.get_tags() elif "signature" in command: text = autodocs.get_signature() @@ -840,7 +870,7 @@ def render(markdown: str, **kwargs) -> str: else: text = "" - markdown = markdown[: match.start()] + text + markdown[match.end() :] + markdown = markdown[:match.start()] + text + markdown[match.end():] # Change the custom autorefs now to use [self-...][] markdown = custom_autorefs(markdown, autodocs) diff --git a/docs_sources/user_guide/models.md b/docs_sources/user_guide/models.md index bdd7656de..fa2e61a0e 100644 --- a/docs_sources/user_guide/models.md +++ b/docs_sources/user_guide/models.md @@ -13,53 +13,11 @@ can be accessed using their acronyms, e.g., `atom.LGB` to access the LightGBM model. 
The available models and their corresponding acronyms are: -* [AdaBoost][] (AdaB) -* [ARIMA][] (Arima) -* [AutoARIMA][] (AutoARIMA) -* [AutomaticRelevanceDetermination][] (ARD) -* [Bagging][] (Bag) -* [BayesianRidge][] (BR) -* [BernoulliNB][] (BNB) -* [CatBoost][] (CatB) -* [CategoricalNB][] (CatNB) -* [ComplementNB][] (CNB) -* [DecisionTree][] (Tree) -* [Dummy][] (Dummy) -* [ElasticNet][] (EN) -* [ETS][] (ETS) -* [ExponentialSmoothing][] (ES) -* [ExtraTree][] (ETree) -* [ExtraTrees][] (ET) -* [GaussianNB][] (GNB) -* [GaussianProcess][] (GP) -* [GradientBoostingMachine][] (GBM) -* [HuberRegression][] (Huber) -* [HistGradientBoosting][] (hGBM) -* [KNearestNeighbors][] (KNN) -* [Lasso][] (Lasso) -* [LeastAngleRegression][] (Lars) -* [LightGBM][] (LGB) -* [LinearDiscriminantAnalysis][] (LDA) -* [LinearSVM][] (lSVM) -* [LogisticRegression][] (LR) -* [MultiLayerPerceptron][] (MLP) -* [MultinomialNB][] (MNB) -* [NaiveForecaster][] (NF) -* [OrdinaryLeastSquares][] (OLS) -* [OrthogonalMatchingPursuit][] (OMP) -* [PassiveAggressive][] (PA) -* [Perceptron][] (Perc) -* [PolynomialTrend][] (PT) -* [QuadraticDiscriminantAnalysis][] (QDA) -* [RadiusNearestNeighbors][] (RNN) -* [RandomForest][] (RF) -* [Ridge][] (Ridge) -* [StochasticGradientDescent][] (SGD) -* [SupportVectorMachine][] (SVM) -* [XGBoost][] (XGB) +:: atom.models:MODELS + :: toc !!! warning - The model classes can not be initialized directly by the user! Use + The model classes cannot be initialized directly by the user! Use them only through atom. !!! tip diff --git a/docs_sources/user_guide/time_series.md b/docs_sources/user_guide/time_series.md index 00e601be7..15381f02c 100644 --- a/docs_sources/user_guide/time_series.md +++ b/docs_sources/user_guide/time_series.md @@ -13,3 +13,13 @@
## Time series regression + + +
+ +## Seasonality + + Seasonality refers to the regular and repeating pattern of variation in data + that occurs at fixed intervals of time, e.g., sales that peak every December. + It's associated with seasonal effects: predictable fluctuations that recur + with the same period. + diff --git a/docs_sources/user_guide/training.md b/docs_sources/user_guide/training.md index 31fa8e384..3114036c6 100644 --- a/docs_sources/user_guide/training.md +++ b/docs_sources/user_guide/training.md @@ -201,7 +201,7 @@ The predefined models that support in-training validation are: * [XGBoost][] To apply in-training validation to a [custom model][custom-models], use the -[`has_validation`][atommodel-has_validation] parameter when creating the +[`validation`][atommodel-validation] parameter when creating the custom model. !!! warning diff --git a/examples/deep_learning.ipynb b/examples/deep_learning.ipynb index 03735284c..ed7302b47 100644 --- a/examples/deep_learning.ipynb +++ b/examples/deep_learning.ipynb @@ -113,7 +113,7 @@ " estimator=ConvNN(verbose=0),\n", " acronym=\"CNN\",\n", " needs_scaling=True, # Applies automated feature scaling before fitting\n", - " has_validation=\"epochs\", # Applies in-training validation on parameter epochs\n", + " validation=\"epochs\", # Applies in-training validation on parameter epochs\n", ")" ] }, diff --git a/examples/in_training_validation.ipynb b/examples/in_training_validation.ipynb index 31c5c31cf..394f11407 100644 --- a/examples/in_training_validation.ipynb +++ b/examples/in_training_validation.ipynb @@ -106,7 +106,7 @@ " \n", " acronym\n", " model\n", - " has_validation\n", + " validation\n", " \n", " \n", " \n", @@ -157,7 +157,7 @@ "" ], "text/plain": [ - " acronym model has_validation\n", + " acronym model validation\n", "3 CatB CatBoost True\n", "15 LGB LightGBM True\n", "19 MLP MultiLayerPerceptron True\n", "26 PA PassiveAggressive True\n", "27 Perc Perceptron True\n", "30 RF RandomForest True\n", "33 SGD StochasticGradientDescent True\n", "35 XGB XGBoost True" ] }, @@ -175,8 +175,8 @@ "source": [ "# Not all models support in-training validation\n", "# You can check which ones do using the available_models method\n", - "df = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]]\n", - "df[df[\"has_validation\"]]" + "df = atom.available_models()[[\"acronym\", \"model\", \"validation\"]]\n", + "df[df[\"validation\"]]" ] }, diff --git a/examples/multioutput_regression.ipynb b/examples/multioutput_regression.ipynb index b2e079d25..eac6434ee 100644 --- a/examples/multioutput_regression.ipynb +++ b/examples/multioutput_regression.ipynb @@ -515,7 +515,7 @@ "\tneeds_scaling=True\n", "\tnative_multioutput=True\n", "\tnative_multilabel=False\n", - "\thas_validation=None\n", + "\tvalidation=None\n", ")\n" ] } diff --git a/mkdocs.yml b/mkdocs.yml index 3e5c1cf6e..0027ecb3b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -73,11 +73,11 @@ extra: plugins: - search - autorefs - - mkdocs-jupyter: - include_requirejs: true - include: ["*.ipynb"] - include_source: true - ignore_h1_titles: true +# - mkdocs-jupyter: +# include_requirejs: true +# include: ["*.ipynb"] +# include_source: true +# ignore_h1_titles: true - mkdocs-simple-hooks: hooks: on_page_markdown: docs_sources.scripts.autodocs:render @@ -93,12 +93,12 @@ markdown_extensions: - pymdownx.details # Allows for collapsible admonition blocks - pymdownx.magiclink # Allows for bare links of the form - pymdownx.inlinehilite # Allows for inline highlighting of code blocks - - pymdownx.superfences: - preserve_tabs: true - custom_fences: - - name: pycon - class: pycon - format: !!python/name:docs_sources.scripts.autorun.formatter +# - pymdownx.superfences: +# preserve_tabs: true +# custom_fences: +# - name: pycon
+# class: pycon +# format: !!python/name:docs_sources.scripts.autorun.formatter - pymdownx.arithmatex: # Allows for rendering of equations generic: true - pymdownx.tabbed: # Allows the usage of content tabs @@ -179,11 +179,13 @@ nav: - AutoARIMA: API/models/autoarima.md - AutomaticRelevanceDetermination: API/models/ard.md - Bagging: API/models/bag.md + - BATS: API/models/bats.md - BayesianRidge: API/models/br.md - BernoulliNB: API/models/bnb.md - CatBoost: API/models/catb.md - CategoricalNB: API/models/catnb.md - ComplementNB: API/models/cnb.md + - Croston: API/models/croston.md - DecisionTree: API/models/tree.md - Dummy: API/models/dummy.md - ElasticNet: API/models/en.md @@ -216,7 +218,10 @@ nav: - RandomForest: API/models/rf.md - Ridge: API/models/ridge.md - StochasticGradientDescent: API/models/sgd.md + - STL: API/models/stl.md - SupportVectorMachine: API/models/svm.md + - TBATS: API/models/tbats.md + - Theta: API/models/theta.md - XGBoost: API/models/xgb.md - Pipeline: - Pipeline: API/pipeline/pipeline.md diff --git a/pyproject.toml b/pyproject.toml index c05abd329..643cec65e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ full = [ "pmdarima>=2.0.3", "schemdraw>=0.16", "sweetviz>=2.3.1", + "tbats>=1.1.3", "wordcloud>=1.9.2", "xgboost>=2.0.0", ] diff --git a/tests/test_api.py b/tests/test_api.py index 88a64efe2..c4ef8db58 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -28,7 +28,7 @@ def test_atommodel(): assert model.acronym == "huber" assert model.needs_scaling is True assert model.native_multioutput is False - assert model.has_validation is None + assert model.validation is None def test_atomclassifier(): diff --git a/tests/test_basemodel.py b/tests/test_basemodel.py index 3bf436f6f..ed82a8385 100644 --- a/tests/test_basemodel.py +++ b/tests/test_basemodel.py @@ -25,12 +25,12 @@ from sklearn.multioutput import ClassifierChain from sklearn.tree import DecisionTreeClassifier -from atom import ATOMClassifier, ATOMModel, ATOMRegressor +from atom import ATOMClassifier, ATOMForecaster, ATOMModel, ATOMRegressor from atom.utils.utils import check_is_fitted, check_scaling from .conftest import ( X10_str, X_bin, X_class, X_idx, X_label, X_reg, y10, y10_str, y_bin, - y_class, y_idx, y_label, y_multiclass, y_reg, + y_class, y_fc, y_idx, y_label, y_multiclass, y_reg, ) @@ -44,13 +44,20 @@ def test_scaler(): assert not atom.lda.scaler -def test_str(): +def test_repr(): """Assert that the __repr__ method works as intended.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) atom.run("LDA") assert str(atom.lda).startswith("LinearDiscriminantAnalysis") +def test_dir(): + """Assert that __dir__ contains all the extra attributes.""" + atom = ATOMClassifier(X_bin, y_bin, random_state=1) + atom.run("dummy") + assert all(attr in dir(atom.dummy) for attr in ("y", "mean radius", "head")) + + def test_getattr(): """Assert that branch attributes can be called from a model.""" atom = ATOMClassifier(X_class, y_class, random_state=1) @@ -638,90 +645,6 @@ def test_all_property(): assert len(atom.ols._all) == len(X_bin) -# Test prediction methods ========================================== >> - -def test_predictions_from_index(): - """Assert that predictions can be made from data indices.""" - atom = ATOMClassifier(X_idx, y_idx, index=True, holdout_size=0.1, random_state=1) - atom.run("LR") - assert isinstance(atom.lr.decision_function(("index_4", "index_5")), pd.Series) - assert isinstance(atom.lr.predict(["index_4", "index_8"]), pd.Series) - assert 
isinstance(atom.lr.predict_log_proba(-100), pd.DataFrame) - assert isinstance(atom.lr.predict_proba("index_4"), pd.DataFrame) - assert isinstance(atom.lr.score(slice(10, 20)), float) - - -def test_transformations_first(): - """Assert that the transformations are applied before predicting.""" - atom = ATOMClassifier(X10_str, y10, random_state=1) - atom.encode(max_onehot=None) - atom.run("Tree") - assert isinstance(atom.tree.predict(X10_str), pd.Series) - - -def test_data_is_scaled(): - """Assert that the data is scaled for models that need it.""" - atom = ATOMClassifier(X_bin, y_bin, random_state=1) - atom.run("LR") - assert sum(atom.lr.predict(X_bin)) > 0 # Always 0 if not scaled - - -def test_predictions_from_new_data(): - """Assert that predictions can be made from new data.""" - atom = ATOMClassifier(X_bin, y_bin, random_state=1) - atom.run("LR") - assert isinstance(atom.lr.predict(X_bin), pd.Series) - assert isinstance(atom.lr.predict_proba(X_bin), pd.DataFrame) - - -def test_prediction_from_multioutput(): - """Assert that predictions can be made for multioutput datasets.""" - atom = ATOMClassifier(X_class, y=y_multiclass, random_state=1) - atom.run("LR") - assert isinstance(atom.lr.predict_proba(X_class).index, pd.MultiIndex) - - -def test_prediction_inverse_transform(): - """Assert that the predict method can return the inversely transformed data.""" - atom = ATOMRegressor(X_reg, y_reg, random_state=1) - atom.scale(columns=-1) - atom.run("Tree") - assert check_scaling(atom.tree.predict(X_reg, inverse=False)) - assert not check_scaling(atom.tree.predict(X_reg, inverse=True)) - - -def test_score_regression(): - """Assert that the score returns r2 for regression tasks.""" - atom = ATOMRegressor(X_reg, y_reg, shuffle=False, random_state=1) - atom.run("Tree") - r2 = r2_score(y_reg, atom.tree.predict(X_reg)) - assert atom.tree.score(X_reg, y_reg) == r2 - - -def test_score_metric_is_None(): - """Assert that the score method returns the default metric.""" - atom = ATOMClassifier(X_bin, y_bin, shuffle=False, random_state=1) - atom.run("Tree") - f1 = f1_score(y_bin, atom.tree.predict(X_bin)) - assert atom.tree.score(X_bin, y_bin) == f1 - - -def test_score_custom_metric(): - """Assert that the score method works when sample weights are provided.""" - atom = ATOMClassifier(X_bin, y_bin, shuffle=False, random_state=1) - atom.run("Tree") - recall = recall_score(y_bin, atom.tree.predict(X_bin)) - assert atom.tree.score(X_bin, y_bin, metric="recall") == recall - - -def test_score_with_sample_weight(): - """Assert that the score method works when sample weights are provided.""" - atom = ATOMClassifier(X_bin, y_bin, random_state=1) - atom.run("Tree") - score = atom.tree.score(X_bin, y_bin, sample_weight=list(range(len(y_bin)))) - assert isinstance(score, float) - - # Test utility methods ============================================= >> def test_calibrate_invalid_task(): @@ -1003,3 +926,103 @@ def test_transform(): X = atom.lr.transform(X10_str) assert len(X.columns) > 3 # Data is one-hot encoded assert all(-3 <= v <= 3 for v in X.to_numpy().ravel()) # Data is scaled + + +# Test ClassRegModel ================================================== >> + +def test_classreg_get_tags(): + """Assert that the get_tags method returns the tags.""" + atom = ATOMClassifier(X_bin, y_bin, random_state=1) + atom.run("LR") + assert isinstance(atom.lr.get_tags(), dict) + + +def test_predictions_from_index(): + """Assert that predictions can be made from data indices.""" + atom = ATOMClassifier(X_idx, y_idx, index=True, 
holdout_size=0.1, random_state=1) + atom.run("LR") + assert isinstance(atom.lr.decision_function(("index_4", "index_5")), pd.Series) + assert isinstance(atom.lr.predict(["index_4", "index_8"]), pd.Series) + assert isinstance(atom.lr.predict_log_proba(-100), pd.DataFrame) + assert isinstance(atom.lr.predict_proba("index_4"), pd.DataFrame) + assert isinstance(atom.lr.score(slice(10, 20)), float) + + +def test_transformations_first(): + """Assert that the transformations are applied before predicting.""" + atom = ATOMClassifier(X10_str, y10, random_state=1) + atom.encode(max_onehot=None) + atom.run("Tree") + assert isinstance(atom.tree.predict(X10_str), pd.Series) + + +def test_data_is_scaled(): + """Assert that the data is scaled for models that need it.""" + atom = ATOMClassifier(X_bin, y_bin, random_state=1) + atom.run("LR") + assert sum(atom.lr.predict(X_bin)) > 0 # Always 0 if not scaled + + +def test_predictions_from_new_data(): + """Assert that predictions can be made from new data.""" + atom = ATOMClassifier(X_bin, y_bin, random_state=1) + atom.run("LR") + assert isinstance(atom.lr.predict(X_bin), pd.Series) + assert isinstance(atom.lr.predict_proba(X_bin), pd.DataFrame) + + +def test_prediction_from_multioutput(): + """Assert that predictions can be made for multioutput datasets.""" + atom = ATOMClassifier(X_class, y=y_multiclass, random_state=1) + atom.run("LR") + assert isinstance(atom.lr.predict_proba(X_class).index, pd.MultiIndex) + + +def test_prediction_inverse_transform(): + """Assert that the predict method can return the inversely transformed data.""" + atom = ATOMRegressor(X_reg, y_reg, random_state=1) + atom.scale(columns=-1) + atom.run("Tree") + assert check_scaling(atom.tree.predict(X_reg, inverse=False)) + assert not check_scaling(atom.tree.predict(X_reg, inverse=True)) + + +def test_score_regression(): + """Assert that the score returns r2 for regression tasks.""" + atom = ATOMRegressor(X_reg, y_reg, shuffle=False, random_state=1) + atom.run("Tree") + r2 = r2_score(y_reg, atom.tree.predict(X_reg)) + assert atom.tree.score(X_reg, y_reg) == r2 + + +def test_score_metric_is_None(): + """Assert that the score method uses the default metric when metric is None.""" + atom = ATOMClassifier(X_bin, y_bin, shuffle=False, random_state=1) + atom.run("Tree") + f1 = f1_score(y_bin, atom.tree.predict(X_bin)) + assert atom.tree.score(X_bin, y_bin) == f1 + + +def test_score_custom_metric(): + """Assert that the score method works with a custom metric.""" + atom = ATOMClassifier(X_bin, y_bin, shuffle=False, random_state=1) + atom.run("Tree") + recall = recall_score(y_bin, atom.tree.predict(X_bin)) + assert atom.tree.score(X_bin, y_bin, metric="recall") == recall + + +def test_score_with_sample_weight(): + """Assert that the score method works when sample weights are provided.""" + atom = ATOMClassifier(X_bin, y_bin, random_state=1) + atom.run("Tree") + score = atom.tree.score(X_bin, y_bin, sample_weight=list(range(len(y_bin)))) + assert isinstance(score, float) + + +# Test ForecastModel =============================================== >> + +def test_forecast_get_tags(): + """Assert that the get_tags method returns the tags.""" + atom = ATOMForecaster(y_fc, random_state=1) + atom.run("NF") + assert isinstance(atom.nf.get_tags(), dict) diff --git a/tests/test_baserunner.py b/tests/test_baserunner.py index 010c145f7..02eb0a419 100644 --- a/tests/test_baserunner.py +++ b/tests/test_baserunner.py @@ -41,6 +41,13 @@ def test_getstate_and_setstate(): ATOMClassifier.load("atom") +def test_dir(): +
"""Assert that __dir__ contains all the extra attributes.""" + atom = ATOMClassifier(X_bin, y_bin, random_state=1) + atom.run("dummy") + assert all(attr in dir(atom) for attr in ("X", "main", "mean radius", "dummy")) + + def test_getattr_branch(): """Assert that branches can be called.""" atom = ATOMClassifier(X_bin, y_bin, random_state=1) diff --git a/tests/test_models.py b/tests/test_models.py index 834b7ad5c..72044a119 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -14,10 +14,10 @@ from optuna.pruners import PatientPruner from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from atom import ATOMClassifier, ATOMModel, ATOMRegressor +from atom import ATOMClassifier, ATOMForecaster, ATOMModel, ATOMRegressor from atom.pipeline import Pipeline -from .conftest import X_bin, X_class, X_reg, y_bin, y_class, y_reg +from .conftest import X_bin, X_class, X_reg, y_bin, y_class, y_fc, y_reg def test_custom_model_properties(): @@ -103,6 +103,12 @@ def test_all_models_regression(): ) +def test_all_models_forecast(): + """Assert that all models work with forecast.""" + atom = ATOMForecaster(y_fc, random_state=2) + atom.run(models=None, n_trials=5, errors="raise") + + @pytest.mark.skipif(machine() not in ("x86_64", "AMD64"), reason="Only x86 support") def test_models_sklearnex_classification(): """Assert the sklearnex engine works for classification tasks."""