add prophet

tvdboom · Dec 19, 2023 · 3efd838
1 parent bafd417
commit 3efd838
Show file tree

Hide file tree

Showing 17 changed files with 832 additions and 83 deletions.
diff --git a/atom/basemodel.py b/atom/basemodel.py
@@ -267,7 +267,7 @@ def __dir__(self) -> list[str]:
         if "_branch" in self.__dict__:
             attrs += [x for x in dir(self.branch) if not x.startswith("_")]
             attrs += list(DF_ATTRS)
-            attrs += list(self.columns)
+            attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)]
         return attrs
 
     def __getattr__(self, item: str) -> Any:
@@ -694,7 +694,7 @@ def _score_from_est(
         X: DataFrame,
         y: Pandas,
         **kwargs,
-    ) -> float:
+    ) -> Float:
         """Calculate the metric score from an estimator.
 
         Parameters
@@ -737,7 +737,7 @@ def _score_from_pred(
         y_true: Pandas,
         y_pred: Pandas,
         **kwargs,
-    ) -> float:
+    ) -> Float:
         """Calculate the metric score from predicted values.
 
         Since sklearn metrics don't support multiclass-multioutput
@@ -770,12 +770,15 @@ def _score_from_pred(
         if self.task.is_forecast and all(x.isna()[0] for x in get_cols(y_pred)):
             y_true, y_pred = y_true.iloc[1:], y_pred.iloc[1:]
 
-        if self.task is Task.multiclass_multioutput_classification:
-            # Get the mean of the scores over the target columns
-            scores = [scorer._sign * func(y_true[c], y_pred[c]) for c in y_pred.columns]
-            return float(np.mean(scores, axis=0))
-        else:
-            return float(scorer._sign * func(y_true, y_pred))
+        try:
+            if self.task is Task.multiclass_multioutput_classification:
+                # Get the mean of the scores over the target columns
+                scores = [scorer._sign * func(y_true[c], y_pred[c]) for c in y_pred.columns]
+                return np.mean(scores, axis=0)
+            else:
+                return scorer._sign * func(y_true, y_pred)
+        except ValueError:
+            return np.NaN  # Some forecast models predict NaN
 
     def _get_score(
         self,
@@ -885,7 +888,7 @@ def fit_model(
                 estimator: Predictor,
                 train_idx: np.ndarray,
                 val_idx: np.ndarray,
-            ) -> tuple[Predictor, list[float]]:
+            ) -> tuple[Predictor, list[Float]]:
                 """Fit the model. Function for parallelization.
 
                 Divide the training set in a (sub) train and validation

diff --git a/atom/baserunner.py b/atom/baserunner.py
@@ -83,7 +83,7 @@ def __dir__(self) -> list[str]:
         attrs += [x for x in dir(self.branch) if not x.startswith("_")]
         attrs += list(DF_ATTRS)
         attrs += [b.name.lower() for b in self._branches]
-        attrs += list(self.columns)
+        attrs += [c for c in self.columns if re.fullmatch(r"\w+$", c)]
         if isinstance(self._models, ClassMap):
             attrs += [m.name.lower() for m in self._models]
         return attrs
@@ -163,25 +163,25 @@ def sp(self) -> int | list[int] | None:
         Read more about seasonality in the [user guide][seasonality].
 
         """
-        return self._sp
+        return self._config.sp
 
     @sp.setter
     def sp(self, sp: Seasonality):
         """Convert seasonal period to integer value."""
         if sp is None:
-            self._sp = None
+            self._config.sp = None
         elif sp == "index":
             if not hasattr(self.dataset.index, "freqstr"):
                 raise ValueError(
                     f"Invalid value for the seasonal period, got {sp}. "
                     f"The dataset's index has no attribute freqstr."
                 )
             else:
-                self._sp = self._get_sp(self.dataset.index.freqstr)
+                self._config.sp = self._get_sp(self.dataset.index.freqstr)
         elif sp == "infer":
-            self._sp = self.get_seasonal_period()
+            self._config.sp = self.get_seasonal_period()
         else:
-            self._sp = flt([self._get_sp(x) for x in lst(sp)])
+            self._config.sp = flt([self._get_sp(x) for x in lst(sp)])
 
     @property
     def og(self) -> Branch:
@@ -901,15 +901,15 @@ def available_models(self) -> pd.DataFrame:
             - **fullname:** Name of the model's class.
             - **estimator:** Class of the model's underlying estimator.
             - **module:** The estimator's module.
-            - **handles_missing:** Whether the model can handle `NaN` values
-              without preprocessing.
+            - **handles_missing:** Whether the model can handle missing
+              (`NaN`) values without preprocessing. If False, consider using
+              the [Imputer][] class before training the models.
             - **needs_scaling:** Whether the model requires feature scaling.
-            - **accepts_sparse:** Whether the model accepts sparse matrices.
-            - **uses_exogenous:** Whether the model uses exogenous variables.
-            - **in_sample_prediction:** Whether the model can do predictions
-              on the training set.
+              If True, [automated feature scaling][] is applied.
+            - **accepts_sparse:** Whether the model accepts [sparse input][sparse-datasets].
+            - **uses_exogenous:** Whether the model uses [exogenous variables][].
             - **multiple_seasonality:** Whether the model can handle more than
-              one [seasonality periods][seasonality].
+              one [seasonality period][seasonality].
             - **native_multilabel:** Whether the model has native support
               for [multilabel][] tasks.
             - **native_multioutput:** Whether the model has native support

diff --git a/atom/basetrainer.py b/atom/basetrainer.py
@@ -197,6 +197,7 @@ def _prepare_parameters(self):
                             "BATS": "tbats",
                             "CatB": "catboost",
                             "LGB": "lightgbm",
+                            "MSTL": "statsforecast",
                             "TBATS": "tbats",
                             "XGB": "xgboost",
                         }

diff --git a/atom/basetransformer.py b/atom/basetransformer.py
@@ -371,9 +371,14 @@ def _inherit(self, obj: T_Estimator) -> T_Estimator:
 
         """
         signature = sign(obj.__init__)  # type: ignore[misc]
-        for p in ("sp", "n_jobs", "random_state"):
-            if p in signature and getattr(obj, p, "<!>") == signature[p]._default:
-                setattr(obj, p, getattr(self, p, signature[p]._default))
+        for p in ("n_jobs", "random_state"):
+            if p in signature and obj.get_params()[p] == signature[p]._default:
+                obj.set_params(**{p: getattr(self, p)})
+
+        # Add seasonal period to the estimator
+        if self._config.sp:
+            if "sp" in signature and obj.get_params()["sp"] == signature["sp"]._default:
+                obj.set_params(sp=self._config.sp)
 
         return obj
 

diff --git a/atom/models/__init__.py b/atom/models/__init__.py
@@ -20,8 +20,9 @@
 from atom.models.custom import CustomModel
 from atom.models.ensembles import Stacking, Voting
 from atom.models.ts import (
-    ARIMA, BATS, ETS, STL, TBATS, AutoARIMA, Croston, ExponentialSmoothing,
-    NaiveForecaster, PolynomialTrend, Theta,
+    ARIMA, BATS, ETS, MSTL, SARIMAX, STL, TBATS, VAR, VARMAX, AutoARIMA,
+    Croston, DynamicFactor, ExponentialSmoothing, NaiveForecaster,
+    PolynomialTrend, Prophet, Theta,
 )
 from atom.utils.types import Predictor
 from atom.utils.utils import ClassMap
@@ -43,6 +44,7 @@
     Croston,
     DecisionTree,
     Dummy,
+    DynamicFactor,
     ElasticNet,
     ETS,
     ExponentialSmoothing,
@@ -60,23 +62,28 @@
     LinearDiscriminantAnalysis,
     LinearSVM,
     LogisticRegression,
+    MSTL,
     MultiLayerPerceptron,
     MultinomialNB,
     NaiveForecaster,
     OrdinaryLeastSquares,
     OrthogonalMatchingPursuit,
     PassiveAggressive,
     Perceptron,
+    Prophet,
     PolynomialTrend,
     QuadraticDiscriminantAnalysis,
     RadiusNearestNeighbors,
     RandomForest,
     Ridge,
+    SARIMAX,
     STL,
     StochasticGradientDescent,
     SupportVectorMachine,
     TBATS,
     Theta,
+    VAR,
+    VARMAX,
     XGBoost,
     key="acronym",
 )

diff --git a/atom/models/classreg.py b/atom/models/classreg.py
@@ -415,7 +415,7 @@ class CatBoost(ClassRegModel):
     """
 
     acronym = "CatB"
-    handles_missing = False
+    handles_missing = True
     needs_scaling = True
     accepts_sparse = True
     native_multilabel = False
@@ -1640,7 +1640,7 @@ class LightGBM(ClassRegModel):
     """
 
     acronym = "LGB"
-    handles_missing = False
+    handles_missing = True
     needs_scaling = True
     accepts_sparse = True
     native_multilabel = False
@@ -2165,7 +2165,7 @@ def _trial_to_est(self, params: dict[str, Any]) -> dict[str, Any]:
         hidden_layer_sizes = [
             value
             for param in [p for p in sorted(params) if p.startswith("hidden_layer")]
-            if (value := params.pop(param))  # Neurons should be more than zero
+            if (value := params.pop(param))  # Neurons should be >0
         ]
 
         if hidden_layer_sizes:
@@ -3078,7 +3078,7 @@ class XGBoost(ClassRegModel):
     """
 
     acronym = "XGB"
-    handles_missing = False
+    handles_missing = True
     needs_scaling = True
     accepts_sparse = True
     native_multilabel = False