Commit: plot_bootstrap
tvdboom committed Jan 18, 2024
1 parent 7b810c2 · commit c6eb43f

Showing 36 changed files with 18,226 additions and 3,211 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/config.yml
@@ -99,10 +99,10 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install -U pip setuptools
-          pip install -U pytest pytest-mock pytest-cov
+          pip install -U pytest pytest-mock pytest-cov pytest-xdist
           pip install -e .[full]
       - name: Run tests w/ coverage
-        run: pytest --cov=atom --cov-report=xml tests/
+        run: pytest -n=auto --cov=atom --cov-report=xml tests/
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' }}
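The new pytest-xdist dependency is what makes the `-n=auto` flag work: it spawns one worker per available CPU core and distributes the test files across them. A minimal sketch for reproducing the parallel run locally (assuming pytest-xdist and pytest-cov are installed, as in the workflow above):

    # run_tests.py -- invoke pytest programmatically with xdist enabled
    import pytest

    # "-n auto" lets pytest-xdist pick one worker per CPU core
    raise SystemExit(pytest.main(["-n", "auto", "--cov=atom", "--cov-report=xml", "tests/"]))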
23 changes: 8 additions & 15 deletions atom/atom.py
@@ -1050,7 +1050,7 @@ def stats(self, _vb: Int = -2, /):
         if hasattr(self.X, "sparse"):  # All columns are sparse
             self._log(f"Density: {100. * self.X.sparse.density:.2f}%", _vb)
         else:  # Not all columns are sparse
-            n_sparse = sum([pd.api.types.is_sparse(self.X[c]) for c in self.X])
+            n_sparse = sum(isinstance(self[c].dtype, pd.SparseDtype) for c in self.features)
             n_dense = self.n_features - n_sparse
             p_sparse = round(100 * n_sparse / self.n_features, 1)
             p_dense = round(100 * n_dense / self.n_features, 1)
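The replaced call matters beyond style: `pd.api.types.is_sparse` is deprecated since pandas 2.0, and checking the dtype directly is the supported spelling. A minimal standalone illustration (assuming only pandas):

    import pandas as pd

    s = pd.Series(pd.arrays.SparseArray([0, 0, 1]))
    # pd.api.types.is_sparse(s) emits a deprecation warning on pandas >= 2.0;
    # an isinstance check on the dtype is equivalent and future-proof.
    print(isinstance(s.dtype, pd.SparseDtype))  # True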
@@ -1193,14 +1193,13 @@ def _add_transformer(
             transformed and the transformer is added to atom's
             pipeline.

-        If the transformer has the n_jobs and/or random_state
-        parameters and they are left to their default value,
-        they adopt atom's values.
-
         Parameters
         ----------
         transformer: Transformer
             Estimator to add. Should implement a `transform` method.
+            If a class is provided (instead of an instance), and it
+            has the `n_jobs` and/or `random_state` parameters, it
+            adopts atom's values.

         columns: int, str, segment, sequence or None, default=None
             Columns in the dataset to transform. If None, transform
@@ -1220,7 +1219,7 @@
         """
         if callable(transformer):
-            transformer_c = transformer()
+            transformer_c = self._inherit(transformer())
         else:
             transformer_c = transformer
@@ -1231,9 +1230,6 @@
                 "new branch to continue the pipeline."
             )

-        # Add BaseTransformer params to the estimator if left to default
-        transformer_c = self._inherit(transformer_c)
-
         if not hasattr(transformer_c, "_train_only"):
             transformer_c._train_only = train_only
@@ -1385,16 +1381,13 @@ def add(
         This means that the transformer should not add, remove or
         shuffle rows unless it returns a dataframe.

-        !!! note
-            If the transformer has a `n_jobs` and/or `random_state`
-            parameter that is left to its default value, it adopts
-            atom's value.
-
         Parameters
         ----------
         transformer: Transformer
             Estimator to add to the pipeline. Should implement a
-            `transform` method.
+            `transform` method. If a class is provided (instead of an
+            instance), and it has the `n_jobs` and/or `random_state`
+            parameters, it adopts atom's values.

         columns: int, str, segment, sequence or None, default=None
             [Selection of columns][row-and-column-selection] to
38 changes: 22 additions & 16 deletions atom/basemodel.py
@@ -448,32 +448,38 @@ def _get_est(self, params: dict[str, Any]) -> Predictor:
         """
         # Separate the params for the estimator from those in sub-estimators
         base_params, sub_params = {}, {}
-        if params:
-            for name, value in params.items():
-                if "__" not in name:
-                    base_params[name] = value
-                else:
-                    sub_params[name] = value
+        for name, value in params.items():
+            if "__" not in name:
+                base_params[name] = value
+            else:
+                sub_params[name] = value

         estimator = self._est_class(**base_params).set_params(**sub_params)

+        fixed = tuple(params)
         if hasattr(self, "task"):
-            if self.task is Task.multilabel_classification:
+            if self.task.is_forecast and self._goal.name not in self._estimators:
+                fixed = tuple(f"estimator__{f}" for f in fixed)
+
+                # Forecasting task with a regressor
+                if self.task.is_multioutput:
+                    estimator = make_reduction(estimator, strategy="multioutput")
+                else:
+                    estimator = make_reduction(estimator, strategy="recursive")
+
+            elif self.task is Task.multilabel_classification:
                 if not self.native_multilabel:
+                    fixed = tuple(f"base_estimator__{f}" for f in fixed)
                     estimator = ClassifierChain(estimator)

             elif self.task.is_multioutput and not self.native_multioutput:
+                fixed = tuple(f"estimator__{f}" for f in fixed)
                 if self.task.is_classification:
                     estimator = MultiOutputClassifier(estimator)
                 elif self.task.is_regression:
                     estimator = MultiOutputRegressor(estimator)
-            elif self.task.is_forecast and self._goal.name not in self._estimators:
-                # Forecasting task with a regressor
-                if self.native_multioutput:
-                    estimator = make_reduction(estimator, strategy="multioutput")
-                else:
-                    estimator = make_reduction(estimator, strategy="recursive")

-        return self._inherit(estimator)
+        return self._inherit(estimator, fixed=fixed)

     def _fit_estimator(
         self,
@@ -1550,7 +1556,7 @@ def results(self) -> pd.Series:
             data[f"{met}_ht"] = self.trials.loc[self.best_trial.number, met]
             data["time_ht"] = self.trials.iloc[-1, -2]
         for met in self._metric:
-            for ds in ("train", "test"):
+            for ds in ["train", "test"] + ([] if self.holdout is None else ["holdout"]):
                 data[f"{met.name}_{ds}"] = self._get_score(met, ds)
         data["time_fit"] = self._time_fit
         if self._bootstrap is not None:
@@ -1975,7 +1981,7 @@ def cross_validate(self, **kwargs) -> pd.DataFrame:
         **kwargs
             Additional keyword arguments for one of these functions.

-            - For forecast tasks: [validate][sktimevalidate].
+            - For forecast tasks: [evaluate][sktimeevaluate].
            - Else: [cross_validate][sklearncrossvalidate].

         Returns
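The reordered branches in `_get_est` detect a forecast task with a regressor first, and the `fixed` tuple gets the `estimator__` prefix so tuned hyperparameters survive the wrapping. The wrapping itself is sktime's standard reduction approach; a sketch of what it produces (assuming sktime and scikit-learn are installed):

    from sklearn.ensemble import GradientBoostingRegressor
    from sktime.datasets import load_airline
    from sktime.forecasting.compose import make_reduction

    # Wrap a tabular regressor so it can forecast recursively
    forecaster = make_reduction(GradientBoostingRegressor(), strategy="recursive", window_length=12)
    forecaster.fit(load_airline())
    print(forecaster.predict(fh=[1, 2, 3]))

    # The regressor's params now live under the "estimator__" prefix,
    # which is why _get_est rewrites the fixed names the same way.
    print("estimator__n_estimators" in forecaster.get_params())  # True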
29 changes: 13 additions & 16 deletions atom/basetransformer.py
@@ -9,6 +9,7 @@

 import os
 import random
+import re
 import tempfile
 import warnings
 from collections.abc import Hashable
@@ -31,16 +32,14 @@
 from joblib.memory import Memory
 from pandas._typing import Axes
 from ray.util.joblib import register_ray
-from sklearn.base import BaseEstimator
 from sklearn.utils.validation import check_memory

-from atom.utils.constants import SHARED_PARAMS
 from atom.utils.types import (
     Backend, Bool, DataFrame, Engine, Estimator, Int, IntLargerEqualZero,
     Pandas, Sequence, Severity, Verbose, Warnings, XSelector, YSelector,
     bool_t, dataframe_t, int_t, sequence_t,
 )
-from atom.utils.utils import crash, flt, lst, n_cols, sign, to_df, to_pandas
+from atom.utils.utils import crash, flt, lst, n_cols, to_df, to_pandas


 T_Estimator = TypeVar("T_Estimator", bound=Estimator)
@@ -354,36 +353,34 @@ def _device_id(self) -> int:

     # Methods ====================================================== >>

-    def _inherit(self, obj: T_Estimator) -> T_Estimator:
+    def _inherit(self, obj: T_Estimator, fixed: tuple[str, ...] = ()) -> T_Estimator:
         """Inherit parameters from parent.

         Utility method to set the sp (seasonal period), n_jobs and
         random_state parameters of an estimator (if available) equal
         to those of this instance. If `obj` is a meta-estimator, it
-        also sets the parameters to the base estimator. Note that
-        the parameters are only changed when the value is equal to
-        the constructor's default value.
+        also adjusts the parameters of the base estimator.

         Parameters
         ----------
         obj: Estimator
             Instance for which to change the parameters.

+        fixed: tuple of str, default=()
+            Fixed parameters that should not be overridden.
+
         Returns
         -------
         Estimator
             Same object with changed parameters.

         """
-        signature = sign(obj.__class__)
-        for p, value in obj.get_params().items():
-            if p in SHARED_PARAMS and (p not in signature or value == signature[p]._default):
-                # Some estimators like XGB use kwargs, so param
-                # isn't in signature. In that case, always override
-                obj.set_params(**{p: getattr(self, p)})
-            elif isinstance(value, BaseEstimator):
-                obj.set_params(**{p: self._inherit(value)})
-            elif p == "sp" and hasattr(self, "_config") and self._config.sp:
+        for p in obj.get_params():
+            if p in fixed:
+                continue
+            elif match := re.search(r"(n_jobs|random_state)$", p):
+                obj.set_params(**{p: getattr(self, match.group())})
+            elif re.search(r"sp$|__sp$", p) and hasattr(self, "_config") and self._config.sp:
                 if self.multiple_seasonality:
                     obj.set_params(**{p: self._config.sp})
                 else:
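The new implementation trades the signature-default check for plain name matching: any parameter whose name ends in n_jobs or random_state is synced with the parent, including nested ones like `estimator__n_jobs`, because `re.search` anchors only at the end of the string. A standalone re-creation of the idea (a hypothetical helper, assuming only scikit-learn):

    import re
    from sklearn.ensemble import RandomForestClassifier

    def inherit_params(obj, parent, fixed=()):
        """Copy n_jobs/random_state from `parent` onto `obj`, skipping `fixed`."""
        for p in obj.get_params():
            if p in fixed:
                continue
            if match := re.search(r"(n_jobs|random_state)$", p):
                obj.set_params(**{p: parent[match.group()]})
        return obj

    parent = {"n_jobs": 2, "random_state": 1}
    est = inherit_params(RandomForestClassifier(random_state=42), parent, fixed=("random_state",))
    print(est.n_jobs, est.random_state)  # 2 42 -> the fixed param was preserved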
8 changes: 4 additions & 4 deletions atom/data_cleaning.py
@@ -406,14 +406,15 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
                 f"Invalid value for the strategy parameter, got {self.strategy}. "
                 f"Choose from: {', '.join(strategies)}."
             )
-            estimator = strategies[self.strategy.lower()](**self.kwargs)
+            est_class = strategies[self.strategy.lower()]
+            estimator = self._inherit(est_class(**self.kwargs), fixed=tuple(self.kwargs))
         elif not hasattr(self.strategy, "fit_resample"):
             raise TypeError(
                 "Invalid type for the strategy parameter. A "
                 "custom estimator must have a fit_resample method."
             )
         elif callable(self.strategy):
-            estimator = self.strategy(**self.kwargs)
+            estimator = self._inherit(self.strategy(**self.kwargs), fixed=tuple(self.kwargs))
         else:
             estimator = self.strategy
@@ -425,8 +426,7 @@ def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
             for key, value in self.mapping_.items():
                 self._counts[key] = np.sum(y == value)

-        # Add n_jobs or random_state if its one of the estimator's parameters
-        self._estimator = self._inherit(estimator).fit(X, y)
+        self._estimator = estimator.fit(X, y)

         # Add the estimator as attribute to the instance
         setattr(self, f"{estimator.__class__.__name__.lower()}_", self._estimator)
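With inheritance moved to construction time, user-supplied kwargs are passed as `fixed`, so the parent's settings can never clobber them. A hedged sketch of the resulting behavior (assuming atom's Balancer API and imblearn's SMOTE backing the "smote" strategy):

    from atom.data_cleaning import Balancer

    # k_neighbors goes straight to SMOTE and is marked fixed;
    # random_state is not in kwargs, so the sampler inherits it
    # from the Balancer instance itself.
    balancer = Balancer(strategy="smote", random_state=1, k_neighbors=3)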
2 changes: 1 addition & 1 deletion atom/models/classreg.py
@@ -1598,7 +1598,7 @@ class LeastAngleRegression(BaseModel):
 class LightGBM(BaseModel):
     """Light Gradient Boosting Machine.

-    LightGBM is a gradient boosting model that uses tree based learning
+    LightGBM is a gradient boosting model that uses tree-based learning
     algorithms. It is designed to be distributed and efficient with the
     following advantages:
28 changes: 14 additions & 14 deletions atom/models/ts.py
@@ -80,7 +80,7 @@ class ARIMA(BaseModel):
     handles_missing = True
     uses_exogenous = True
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -244,7 +244,7 @@ class AutoARIMA(BaseModel):
     handles_missing = True
     uses_exogenous = True
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -336,7 +336,7 @@ class BATS(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -417,7 +417,7 @@ class Croston(BaseModel):
     handles_missing = False
     uses_exogenous = True
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -543,7 +543,7 @@ class ExponentialSmoothing(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -629,7 +629,7 @@ class ETS(BaseModel):
     handles_missing = True
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -723,7 +723,7 @@ class MSTL(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = True
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -825,7 +825,7 @@ class NaiveForecaster(BaseModel):
     handles_missing = True
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -880,7 +880,7 @@ class PolynomialTrend(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -940,7 +940,7 @@ class Prophet(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = True
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -1046,7 +1046,7 @@ class SARIMAX(BaseModel):
     handles_missing = False
     uses_exogenous = True
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -1183,7 +1183,7 @@ class STL(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -1260,7 +1260,7 @@ class TBATS(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = True
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
@@ -1345,7 +1345,7 @@ class Theta(BaseModel):
     handles_missing = False
     uses_exogenous = False
     multiple_seasonality = False
-    native_multioutput = False
+    native_multioutput = True
     supports_engines = ("sktime",)

     _estimators: ClassVar[dict[str, str]] = {
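Flipping native_multioutput to True across these statistical forecasters appears to lean on sktime's vectorization: its univariate forecasters broadcast over the columns of a multivariate y, so no reduction or MultiOutput wrapper is needed. A quick illustration (assuming sktime and a multivariate DataFrame):

    import numpy as np
    import pandas as pd
    from sktime.forecasting.naive import NaiveForecaster

    y = pd.DataFrame(
        np.random.default_rng(1).random((36, 2)),
        columns=["series_a", "series_b"],
        index=pd.period_range("2020-01", periods=36, freq="M"),
    )
    forecaster = NaiveForecaster(strategy="last")
    forecaster.fit(y)
    print(forecaster.predict(fh=[1, 2, 3]))  # one forecast column per series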
… (29 more changed files not shown)