diff --git a/README.md b/README.md
index 6cc7a7d11..469065610 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@
**General Information** | |
--- | ---
-**Repository** | [![Project Status: Active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![Conda Recipe](https://img.shields.io/badge/recipe-atom--ml-green.svg)](https://anaconda.org/conda-forge/atom-ml) [![License: MIT](https://img.shields.io/github/license/tvdboom/ATOM)](https://opensource.org/licenses/MIT) [![Downloads](https://pepy.tech/badge/atom-ml)](https://pepy.tech/project/atom-ml)
+**Repository** | [![Project Status: Active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![Conda Recipe](https://img.shields.io/badge/recipe-atom--ml-green.svg)](https://anaconda.org/conda-forge/atom-ml) [![License: MIT](https://img.shields.io/github/license/tvdboom/ATOM)](https://opensource.org/licenses/MIT) [![Downloads](https://static.pepy.tech/badge/atom-ml)](https://pepy.tech/project/atom-ml)
**Release** | [![pdm-managed](https://img.shields.io/badge/pdm-managed-blueviolet)](https://pdm.fming.dev) [![PyPI version](https://img.shields.io/pypi/v/atom-ml)](https://pypi.org/project/atom-ml/) [![Conda Version](https://img.shields.io/conda/vn/conda-forge/atom-ml.svg)](https://anaconda.org/conda-forge/atom-ml) [![DOI](https://zenodo.org/badge/195069958.svg)](https://zenodo.org/badge/latestdoi/195069958)
**Compatibility** | [![Python 3.8\|3.9\|3.10\|3.11](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue?logo=python)](https://www.python.org) [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/atom-ml.svg)](https://anaconda.org/conda-forge/atom-ml)
**Build status** | [![Build Status](https://github.com/tvdboom/ATOM/workflows/ATOM/badge.svg)](https://github.com/tvdboom/ATOM/actions) [![Azure Pipelines](https://dev.azure.com/conda-forge/feedstock-builds/_apis/build/status/atom-ml-feedstock?branchName=master)](https://dev.azure.com/conda-forge/feedstock-builds/_build/latest?definitionId=10822&branchName=master) [![codecov](https://codecov.io/gh/tvdboom/ATOM/branch/master/graph/badge.svg)](https://codecov.io/gh/tvdboom/ATOM)
diff --git a/atom/api.py b/atom/api.py
index 8eea06ddf..2cbba7db3 100644
--- a/atom/api.py
+++ b/atom/api.py
@@ -18,8 +18,8 @@
from atom.atom import ATOM
from atom.basetransformer import BaseTransformer
from atom.utils.types import (
- BACKEND, BOOL, ENGINE, GOAL, INDEX_SELECTOR, INT, PREDICTOR, SCALAR,
- TARGET,
+ BACKEND, BOOL, ENGINE, INDEX_SELECTOR, INT, PREDICTOR, SCALAR,
+ TARGET, WARNINGS,
)
@@ -160,7 +160,6 @@ class ATOMClassifier(BaseTransformer, ATOM):
y: int, str, dict, sequence or dataframe, default=-1
Target column corresponding to X.
- - If None: y is ignored.
- If int: Position of the target column in X.
- If str: Name of the target column in X.
- If sequence: Target array with shape=(n_samples,) or
@@ -336,7 +335,7 @@ def __init__(
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: BACKEND = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: BOOL | str = False,
+ warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
@@ -353,7 +352,7 @@ def __init__(
random_state=random_state,
)
- self.goal: GOAL = "class"
+ self.goal = "class"
ATOM.__init__(
self,
arrays=arrays,
@@ -555,7 +554,7 @@ def __init__(
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: BACKEND = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: BOOL | str = False,
+ warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
@@ -572,7 +571,7 @@ def __init__(
random_state=random_state,
)
- self.goal: GOAL = "fc"
+ self.goal = "fc"
ATOM.__init__(
self,
arrays=arrays,
@@ -790,7 +789,7 @@ def __init__(
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: BACKEND = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: BOOL | str = False,
+ warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
@@ -807,7 +806,7 @@ def __init__(
random_state=random_state,
)
- self.goal: GOAL = "reg"
+ self.goal = "reg"
ATOM.__init__(
self,
arrays=arrays,
diff --git a/atom/atom.py b/atom/atom.py
index 817f51fb8..73127f3e9 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -36,9 +36,7 @@
)
from atom.models import MODELS
from atom.nlp import TextCleaner, TextNormalizer, Tokenizer, Vectorizer
-from atom.plots import (
- DataPlot, FeatureSelectorPlot, HTPlot, PredictionPlot, ShapPlot,
-)
+from atom.plots import ATOMPlot
from atom.training import (
DirectClassifier, DirectForecaster, DirectRegressor,
SuccessiveHalvingClassifier, SuccessiveHalvingForecaster,
@@ -47,9 +45,10 @@
)
from atom.utils.constants import MISSING_VALUES, __version__
from atom.utils.types import (
- BOOL, DATAFRAME, DATASET, FEATURES, INDEX, INDEX_SELECTOR, INT,
- METRIC_SELECTOR, PANDAS, PREDICTOR, RUNNER, SCALAR, SEQUENCE, SERIES,
- SLICE, TARGET, TRANSFORMER, TS_INDEX_TYPES,
+ BOOL, DATAFRAME, DATASET, DISCRETIZER_STRATS, ESTIMATOR, FEATURES, INDEX,
+ INDEX_SELECTOR, INT, METRIC_SELECTOR, PANDAS, PREDICTOR, PRUNER_STRATS,
+ RUNNER, SCALAR, SCALER_STRATS, SEQUENCE, SERIES, SLICE, STRAT_NUM, TARGET,
+ TRANSFORMER, TS_INDEX_TYPES,
)
from atom.utils.utils import (
ClassMap, DataConfig, check_dependency, check_is_fitted, check_scaling,
@@ -60,7 +59,7 @@
@typechecked
-class ATOM(BaseRunner, FeatureSelectorPlot, DataPlot, HTPlot, PredictionPlot, ShapPlot):
+class ATOM(BaseRunner, ATOMPlot):
"""ATOM base class.
The ATOM class is a convenient wrapper for all data cleaning,
@@ -160,7 +159,7 @@ def __repr__(self) -> str:
return out
- def __iter__(self) -> TRANSFORMER:
+ def __iter__(self) -> TRANSFORMER | None:
yield from self.pipeline.values
# Utility properties =========================================== >>
@@ -545,7 +544,7 @@ def inverse_transform(
y: TARGET | None = None,
*,
verbose: INT | None = None,
- ) -> PANDAS | tuple[DATAFRAME, SERIES]:
+ ) -> PANDAS | tuple[DATAFRAME, PANDAS]:
"""Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
@@ -898,7 +897,7 @@ def get_data(new_t: str) -> SERIES:
get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max()
)
- if self.engine["data"] == "pyarrow":
+ if self.engine.get("data") == "pyarrow":
self.branch.dataset = self.branch.dataset.astype(
{name: to_pyarrow(col) for name, col in self.branch._data.items()}
)
@@ -986,7 +985,7 @@ def transform(
y: TARGET | None = None,
*,
verbose: INT | None = None,
- ) -> PANDAS | tuple[DATAFRAME, SERIES]:
+ ) -> PANDAS | tuple[DATAFRAME, PANDAS]:
"""Transform new data through the pipeline.
Transformers that are only applied on the training set are
@@ -1068,7 +1067,7 @@ def _add_transformer(
self,
transformer: TRANSFORMER,
columns: SLICE | None = None,
- train_only: bool = False,
+ train_only: BOOL = False,
**fit_params,
):
"""Add a transformer to the pipeline.
@@ -1106,9 +1105,6 @@ def _add_transformer(
"new branch to continue the pipeline."
)
- if not hasattr(transformer, "transform"):
- raise AttributeError("Added transformers should have a transform method!")
-
# Add BaseTransformer params to the estimator if left to default
transformer = self._inherit(transformer)
@@ -1160,7 +1156,7 @@ def add(
transformer: TRANSFORMER,
*,
columns: SLICE | None = None,
- train_only: bool = False,
+ train_only: BOOL = False,
**fit_params,
):
"""Add a transformer to the pipeline.
@@ -1249,9 +1245,8 @@ def apply(
):
"""Apply a function to the dataset.
- The function should have signature `func(dataset, **kw_args) ->
- dataset`. This method is useful for stateless transformations
- such as taking the log, doing custom scaling, etc...
+ This method is useful for stateless transformations such as
+ taking the log, doing custom scaling, etc...
!!! note
This approach is preferred over changing the dataset directly
@@ -1265,7 +1260,8 @@ def apply(
Parameters
----------
func: callable
- Function to apply.
+ Function to apply with signature `func(dataset, **kw_args) ->
+ dataset`.
inverse_func: callable or None, default=None
Inverse function of `func`. If None, the inverse_transform
@@ -1336,13 +1332,13 @@ def balance(self, strategy: str = "adasyn", **kwargs):
def clean(
self,
*,
- convert_dtypes: bool = True,
+ convert_dtypes: BOOL = True,
drop_dtypes: str | SEQUENCE | None = None,
drop_chars: str | None = None,
- strip_categorical: bool = True,
- drop_duplicates: bool = False,
- drop_missing_target: bool = True,
- encode_target: bool = True,
+ strip_categorical: BOOL = True,
+ drop_duplicates: BOOL = False,
+ drop_missing_target: BOOL = True,
+ encode_target: BOOL = True,
**kwargs,
):
"""Applies standard data cleaning steps on the dataset.
@@ -1382,7 +1378,7 @@ def clean(
@composed(crash, method_to_log)
def discretize(
self,
- strategy: str = "quantile",
+ strategy: DISCRETIZER_STRATS = "quantile",
*,
bins: INT | SEQUENCE | dict = 5,
labels: SEQUENCE | dict | None = None,
@@ -1467,7 +1463,7 @@ def encode(
@composed(crash, method_to_log)
def impute(
self,
- strat_num: SCALAR | Literal["drop", "mean", "knn", "most_frequent"] = "drop",
+ strat_num: STRAT_NUM = "drop",
strat_cat: Literal["drop", "most_frequent"] | str = "drop",
*,
max_nan_rows: SCALAR | None = None,
@@ -1539,11 +1535,11 @@ def normalize(
@composed(crash, method_to_log)
def prune(
self,
- strategy: str | SEQUENCE = "zscore",
+ strategy: PRUNER_STRATS | SEQUENCE = "zscore",
*,
method: SCALAR | Literal["drop", "minmax"] = "drop",
max_sigma: SCALAR = 3,
- include_target: bool = False,
+ include_target: BOOL = False,
**kwargs,
):
"""Prune outliers from the training set.
@@ -1581,7 +1577,12 @@ def prune(
setattr(self.branch, strat.lower(), getattr(pruner, strat.lower()))
@composed(crash, method_to_log)
- def scale(self, strategy: str = "standard", include_binary: bool = False, **kwargs):
+ def scale(
+ self,
+ strategy: SCALER_STRATS = "standard",
+ include_binary: BOOL = False,
+ **kwargs,
+ ):
"""Scale the data.
Apply one of sklearn's scalers. Categorical columns are ignored.
@@ -1611,19 +1612,19 @@ def scale(self, strategy: str = "standard", include_binary: bool = False, **kwar
def textclean(
self,
*,
- decode: bool = True,
- lower_case: bool = True,
- drop_email: bool = True,
+ decode: BOOL = True,
+ lower_case: BOOL = True,
+ drop_email: BOOL = True,
regex_email: str | None = None,
- drop_url: bool = True,
+ drop_url: BOOL = True,
regex_url: str | None = None,
- drop_html: bool = True,
+ drop_html: BOOL = True,
regex_html: str | None = None,
- drop_emoji: bool = True,
+ drop_emoji: BOOL = True,
regex_emoji: str | None = None,
- drop_number: bool = True,
+ drop_number: BOOL = True,
regex_number: str | None = None,
- drop_punctuation: bool = True,
+ drop_punctuation: BOOL = True,
**kwargs,
):
"""Applies standard text cleaning to the corpus.
@@ -1664,10 +1665,10 @@ def textclean(
def textnormalize(
self,
*,
- stopwords: bool | str = True,
+ stopwords: BOOL | str = True,
custom_stopwords: SEQUENCE | None = None,
- stem: bool | str = False,
- lemmatize: bool = True,
+ stem: BOOL | str = False,
+ lemmatize: BOOL = True,
**kwargs,
):
"""Normalize the corpus.
@@ -1727,7 +1728,13 @@ def tokenize(
self.branch.quadgrams = tokenizer.quadgrams
@composed(crash, method_to_log)
- def vectorize(self, strategy: str = "bow", *, return_sparse: bool = True, **kwargs):
+ def vectorize(
+ self,
+ strategy: Literal["bow", "tfidf", "hashing"] = "bow",
+ *,
+ return_sparse: BOOL = True,
+ **kwargs,
+ ):
"""Vectorize the corpus.
Transform the corpus into meaningful vectors of numbers. The
@@ -1766,7 +1773,7 @@ def feature_extraction(
fmt: str | SEQUENCE | None = None,
*,
encoding_type: str = "ordinal",
- drop_columns: bool = True,
+ drop_columns: BOOL = True,
**kwargs,
):
"""Extract features from datetime columns.
@@ -1831,7 +1838,7 @@ def feature_grouping(
group: dict[str, str | SEQUENCE],
*,
operators: str | SEQUENCE | None = None,
- drop_columns: bool = True,
+ drop_columns: BOOL = True,
**kwargs,
):
"""Extract statistics from similar features.
@@ -1862,7 +1869,7 @@ def feature_selection(
self,
strategy: str | None = None,
*,
- solver: str | Callable | None = None,
+ solver: str | ESTIMATOR | None = None,
n_features: SCALAR | None = None,
min_repeated: SCALAR | None = 2,
max_repeated: SCALAR | None = 1.0,
@@ -2005,7 +2012,7 @@ def run(
n_trials: INT | dict | SEQUENCE = 0,
ht_params: dict | None = None,
n_bootstrap: INT | SEQUENCE = 0,
- parallel: bool = False,
+ parallel: BOOL = False,
errors: Literal["raise", "skip", "keep"] = "skip",
**kwargs,
):
@@ -2061,7 +2068,7 @@ def successive_halving(
n_trials: INT | dict | SEQUENCE = 0,
ht_params: dict | None = None,
n_bootstrap: INT | dict | SEQUENCE = 0,
- parallel: bool = False,
+ parallel: BOOL = False,
errors: Literal["raise", "skip", "keep"] = "skip",
**kwargs,
):
@@ -2124,7 +2131,7 @@ def train_sizing(
n_trials: INT | dict | SEQUENCE = 0,
ht_params: dict | None = None,
n_bootstrap: INT | dict | SEQUENCE = 0,
- parallel: bool = False,
+ parallel: BOOL = False,
errors: Literal["raise", "skip", "keep"] = "skip",
**kwargs,
):
diff --git a/atom/basemodel.py b/atom/basemodel.py
index 1060c7421..723c128da 100644
--- a/atom/basemodel.py
+++ b/atom/basemodel.py
@@ -17,7 +17,7 @@
from logging import Logger
from typing import Any, Callable, Literal
from unittest.mock import patch
-
+from typeguard import TypeCheckError
import dill as pickle
import mlflow
import numpy as np
@@ -56,12 +56,12 @@
from atom.basetransformer import BaseTransformer
from atom.data_cleaning import Scaler
from atom.pipeline import Pipeline
-from atom.plots import HTPlot, PredictionPlot, ShapPlot
+from atom.plots import RunnerPlot
from atom.utils.constants import DF_ATTRS
from atom.utils.types import (
BOOL, BRANCH, DATAFRAME, DATAFRAME_TYPES, ENGINE, FEATURES, FLOAT,
- FLOAT_TYPES, GOAL, INDEX, INT, INT_TYPES, METRIC_SELECTOR, PANDAS,
- PREDICTOR, SCALAR, SCORER, SEQUENCE, SERIES, SLICE, TARGET,
+ FLOAT_TYPES, INDEX, INT, INT_TYPES, METRIC_SELECTOR, PANDAS,
+ PREDICTOR, SCALAR, SCORER, SEQUENCE, SERIES, SLICE, TARGET, WARNINGS,
)
from atom.utils.utils import (
ClassMap, CustomDict, DataConfig, PlotCallback, ShapExplanation,
@@ -75,7 +75,7 @@
@typechecked
-class BaseModel(BaseTransformer, BaseTracker, HTPlot, PredictionPlot, ShapPlot):
+class BaseModel(BaseTransformer, BaseTracker, RunnerPlot):
"""Base class for all models.
Parameters
@@ -174,7 +174,7 @@ class BaseModel(BaseTransformer, BaseTracker, HTPlot, PredictionPlot, ShapPlot):
def __init__(
self,
name: str | None = None,
- goal: GOAL = "class",
+ goal: Literal["class", "reg", "fc"] = "class",
config: DataConfig | None = None,
og: BRANCH | None = None,
branch: BRANCH | None = None,
@@ -184,7 +184,7 @@ def __init__(
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: str = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: BOOL | str = False,
+ warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
@@ -276,16 +276,12 @@ def _fullname(self) -> str:
"""Return the model's class name."""
return self.__class__.__name__
- @property
- def _gpu(self) -> BOOL:
- """Return whether the model uses a GPU implementation."""
- return "gpu" in self.device.lower()
-
@property
def _est_class(self) -> PREDICTOR:
"""Return the estimator's class (not instance)."""
try:
- module = import_module(f"{self.engine['estimator']}.{self._module}")
+ engine = self.engine.get("estimator", "sklearn")
+ module = import_module(f"{engine}.{self._module}")
cls = self._estimators.get(self.goal, self._estimators.get("reg"))
except (ModuleNotFoundError, AttributeError):
if "sklearn" in self.supports_engines:
@@ -442,9 +438,9 @@ def _get_est(self, **params) -> PREDICTOR:
def _fit_estimator(
self,
estimator: PREDICTOR,
- data: tuple[DATAFRAME, SERIES],
+ data: tuple[DATAFRAME, PANDAS],
est_params_fit: dict,
- validation: tuple[DATAFRAME, SERIES] | None = None,
+ validation: tuple[DATAFRAME, PANDAS] | None = None,
trial: Trial | None = None,
) -> PREDICTOR:
"""Fit the estimator and perform in-training validation.
@@ -581,7 +577,7 @@ def _final_output(self) -> str:
if (1.2 if score_train < 0 else 0.8) * score_train > score_test:
out += " ~"
- except AttributeError: # Fails when model failed but errors="keep"
+ except TypeCheckError: # Fails when model failed but errors="keep"
out = "FAIL"
return out
@@ -692,7 +688,7 @@ def _score_from_pred(
y_true: PANDAS,
y_pred: PANDAS,
**kwargs,
- ) -> FLOAT:
+ ) -> SCALAR:
"""Calculate the metric score from predicted values.
Since sklearn metrics don't support multiclass-multioutput
@@ -715,7 +711,7 @@ def _score_from_pred(
Returns
-------
- float
+ int or float
Calculated score.
"""
@@ -740,7 +736,7 @@ def _get_score(
dataset: str,
threshold: tuple[FLOAT] | None = None,
sample_weight: tuple | None = None,
- ) -> FLOAT:
+ ) -> SCALAR:
"""Calculate a metric score using the prediction attributes.
The method results are cached to avoid recalculation of the
@@ -771,7 +767,7 @@ def _get_score(
Returns
-------
- float
+ int or float
Metric score on the selected data set.
"""
@@ -886,7 +882,7 @@ def fit_model(
y_val = self.og.y_train.iloc[val_idx]
# Transform subsets if there is a pipeline
- if len(pl := self.export_pipeline(verbose=0)[:-1]) > 0:
+ if len(pl := export_pipeline(self.pipeline, verbose=0)) > 0:
X_subtrain, y_subtrain = pl.fit_transform(X_subtrain, y_subtrain)
X_val, y_val = pl.transform(X_val, y_val)
@@ -1401,17 +1397,17 @@ def evals(self) -> CustomDict:
return self._evals
@property
- def score_train(self) -> FLOAT | list[FLOAT]:
+ def score_train(self) -> SCALAR | list[SCALAR]:
"""Metric score on the training set."""
return flt([self._get_score(m, "train") for m in self._metric])
@property
- def score_test(self) -> FLOAT | list[FLOAT]:
+ def score_test(self) -> SCALAR | list[SCALAR]:
"""Metric score on the test set."""
return flt([self._get_score(m, "test") for m in self._metric])
@property
- def score_holdout(self) -> FLOAT | list[FLOAT]:
+ def score_holdout(self) -> SCALAR | list[SCALAR]:
"""Metric score on the holdout set."""
return flt([self._get_score(m, "holdout") for m in self._metric])
@@ -1433,7 +1429,7 @@ def bootstrap(self) -> pd.DataFrame | None:
return self._bootstrap
@property
- def score_bootstrap(self) -> FLOAT | list[FLOAT] | None:
+ def score_bootstrap(self) -> SCALAR | list[SCALAR] | None:
"""Mean metric score on the bootstrapped samples."""
if self.bootstrap is not None:
return flt(self.bootstrap.mean().tolist())
@@ -2141,7 +2137,7 @@ def inverse_transform(
y: TARGET | None = None,
*,
verbose: INT | None = None,
- ) -> PANDAS | tuple[DATAFRAME, SERIES]:
+ ) -> PANDAS | tuple[DATAFRAME, PANDAS]:
"""Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
@@ -2200,7 +2196,7 @@ def register(
self,
name: str | None = None,
stage: str = "None",
- archive_existing_versions: bool = False,
+ archive_existing_versions: BOOL = False,
):
"""Register the model in [mlflow's model registry][registry].
@@ -2340,7 +2336,7 @@ def transform(
y: TARGET | None = None,
*,
verbose: INT | None = None,
- ) -> PANDAS | tuple[DATAFRAME, SERIES]:
+ ) -> PANDAS | tuple[DATAFRAME, PANDAS]:
"""Transform new data through the pipeline.
Transformers that are only applied on the training set are
@@ -3490,7 +3486,7 @@ def predict_proba(
self,
fh: int | SEQUENCE | ForecastingHorizon,
X: FEATURES | None = None,
- marginal: bool = True,
+ marginal: BOOL = True,
verbose: INT | None = None,
) -> Normal:
"""Get probabilistic forecasts on new data or existing rows.
@@ -3624,7 +3620,7 @@ def predict_var(
self,
fh: int | SEQUENCE | ForecastingHorizon,
X: FEATURES | None = None,
- cov: bool = False,
+ cov: BOOL = False,
verbose: INT | None = None,
) -> DATAFRAME:
"""Get probabilistic forecasts on new data or existing rows.
diff --git a/atom/baserunner.py b/atom/baserunner.py
index 758553df0..7d66f424e 100644
--- a/atom/baserunner.py
+++ b/atom/baserunner.py
@@ -97,13 +97,10 @@ def __len__(self) -> int:
return len(self.dataset)
def __contains__(self, item: str) -> BOOL:
- if self.dataset is None:
- return False
- else:
- return item in self.dataset
+ return item in self.dataset
def __getitem__(self, item: INT | str | list) -> Any:
- if self.dataset is None:
+ if self.dataset.empty:
raise RuntimeError(
"This instance has no dataset annexed to it. "
"Use the run method before calling __getitem__."
@@ -122,18 +119,13 @@ def __getitem__(self, item: INT | str | list) -> Any:
f"{self.__class__.__name__} object has no "
f"branch, model or column called {item}."
)
- elif isinstance(item, list):
- return self.dataset[item] # Get subset of dataset
else:
- raise TypeError(
- f"{self.__class__.__name__} is only "
- "subscriptable with types int, str or list."
- )
+ return self.dataset[item] # Get subset of dataset
# Utility properties =========================================== >>
@property
- def og(self) -> Branch:
+ def og(self) -> BRANCH:
"""Branch containing the original dataset.
This branch contains the data prior to any transformations.
@@ -144,7 +136,7 @@ def og(self) -> Branch:
return self._og or self.branch
@property
- def branch(self) -> Branch:
+ def branch(self) -> BRANCH:
"""Current active branch.
Use the property's `@setter` to change the branch or to create
diff --git a/atom/basetracker.py b/atom/basetracker.py
index 6a919a637..8cb73fe9c 100644
--- a/atom/basetracker.py
+++ b/atom/basetracker.py
@@ -7,6 +7,8 @@
"""
+from __future__ import annotations
+
from dataclasses import dataclass
from typeguard import typechecked
diff --git a/atom/basetrainer.py b/atom/basetrainer.py
index 6b26fc6f9..0c94f89d3 100644
--- a/atom/basetrainer.py
+++ b/atom/basetrainer.py
@@ -13,7 +13,7 @@
import traceback
from datetime import datetime as dt
from typing import Any
-
+from typeguard import TypeCheckError
import joblib
import mlflow
import numpy as np
@@ -28,7 +28,7 @@
from atom.branch import Branch
from atom.data_cleaning import BaseTransformer
from atom.models import MODELS, CustomModel
-from atom.plots import HTPlot, PredictionPlot, ShapPlot
+from atom.plots import RunnerPlot
from atom.utils.types import MODEL, SEQUENCE_TYPES
from atom.utils.utils import (
ClassMap, DataConfig, check_dependency, get_best_score, get_custom_scorer,
@@ -37,7 +37,7 @@
@typechecked
-class BaseTrainer(BaseTransformer, BaseRunner, HTPlot, PredictionPlot, ShapPlot):
+class BaseTrainer(BaseTransformer, BaseRunner, RunnerPlot):
"""Base class for trainers.
Implements methods to check the validity of the parameters,
@@ -432,7 +432,7 @@ def execute_model(m: MODEL) -> MODEL | None:
try:
scores.append(get_best_score(model))
- except AttributeError: # Fails when model failed but errors="keep"
+ except TypeCheckError: # Fails when model failed but errors="keep"
scores.append(-np.inf)
maxlen = max(maxlen, len(names[-1]))
diff --git a/atom/basetransformer.py b/atom/basetransformer.py
index edac1266a..c94b0df4a 100644
--- a/atom/basetransformer.py
+++ b/atom/basetransformer.py
@@ -18,7 +18,7 @@
from importlib.util import find_spec
from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
from multiprocessing import cpu_count
-from typing import Any, Callable
+from typing import Any, Callable, Literal
import dagshub
import dill as pickle
@@ -33,8 +33,8 @@
from typeguard import typechecked
from atom.utils.types import (
- BOOL, DATAFRAME, DATAFRAME_TYPES, FEATURES, INDEX, INT, INT_TYPES, PANDAS,
- PREDICTOR, SCALAR, SEQUENCE, SEQUENCE_TYPES, TARGET,
+ BACKEND, BOOL, DATAFRAME, DATAFRAME_TYPES, ENGINE, ESTIMATOR, FEATURES,
+ INT, INT_TYPES, PANDAS, SCALAR, SEQUENCE, SEQUENCE_TYPES, TARGET, WARNINGS,
)
from atom.utils.utils import (
bk, composed, crash, get_cols, lst, merge, method_to_log, n_cols, pd, sign,
@@ -101,7 +101,8 @@ def n_jobs(self, value: INT):
# Final check for negative input
if value < 1:
raise ValueError(
- f"Invalid value for the n_jobs parameter, got {value}.", 1
+ "Invalid value for the n_jobs parameter, "
+                f"got {value}. Value should be >=1.", 1
)
self._n_jobs = value
@@ -118,92 +119,55 @@ def device(self, value: str):
os.environ["CUDA_VISIBLE_DEVICES"] = str(self._device_id)
@property
- def engine(self) -> dict:
+ def engine(self) -> ENGINE:
"""Execution engine for estimators."""
return self._engine
@engine.setter
- def engine(self, value: dict | None):
- if not value:
- value = {"data": "numpy", "estimator": "sklearn"}
- elif "data" not in value and "estimator" not in value:
- raise ValueError(
- f"Invalid value for the engine parameter, got {value}. "
- "The value should be a dict with keys 'data' and/or 'estimator'."
+ def engine(self, value: ENGINE):
+ if value.get("data") == "modin" and not ray.is_initialized():
+ ray.init(
+ runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}},
+ log_to_driver=False,
)
- if data := value.get("data"):
- if data.lower() == "modin":
- if not ray.is_initialized():
- ray.init(
- runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}},
- log_to_driver=False,
- )
- elif data.lower() not in ("numpy", "pyarrow"):
- raise ValueError(
- "Invalid value for the data key of the engine parameter, "
- f"got {data}. Choose from: numpy, pyarrow, modin."
- )
- else:
- value["data"] = "numpy"
-
# Update env variable to use for PandasModin in utils.py
- os.environ["ATOM_DATA_ENGINE"] = value["data"].lower()
-
- if models := value.get("estimator"):
- device = self.device.lower()
-
- if models.lower() == "sklearnex":
- if not find_spec("sklearnex"):
- raise ModuleNotFoundError(
- "Failed to import scikit-learn-intelex. The library is "
- "not installed. Note that the library only supports CPUs "
- "with a x86 architecture."
- )
- else:
- import sklearnex
- sklearnex.set_config(device if "gpu" in device else "auto")
- elif models.lower() == "cuml":
- if not find_spec("cuml"):
- raise ModuleNotFoundError(
- "Failed to import cuml. Package is not installed. Refer "
- "to: https://rapids.ai/start.html#install."
- )
- else:
- from cuml.common.device_selection import (
- set_global_device_type,
- )
- set_global_device_type("gpu" if "gpu" in device else "cpu")
-
- # See https://github.com/rapidsai/cuml/issues/5564
- from cuml.internals.memory_utils import (
- set_global_output_type,
- )
- set_global_output_type("numpy")
-
- elif models.lower() != "sklearn":
- raise ValueError(
- "Invalid value for the models key of the engine parameter, "
- f"got {models}. Choose from: sklearn, sklearnex, cuml."
+ os.environ["ATOM_DATA_ENGINE"] = value.get("data", "numpy")
+
+ if value.get("estimator") == "sklearnex":
+ if not find_spec("sklearnex"):
+ raise ModuleNotFoundError(
+ "Failed to import scikit-learn-intelex. The library is "
+ "not installed. Note that the library only supports CPUs "
+ "with a x86 architecture."
)
- else:
- value["estimator"] = "sklearn"
+ else:
+ import sklearnex
+ sklearnex.set_config(self.device.lower() if self._gpu else "auto")
+ elif value.get("estimator") == "cuml":
+ if not find_spec("cuml"):
+ raise ModuleNotFoundError(
+ "Failed to import cuml. Package is not installed. Refer "
+ "to: https://rapids.ai/start.html#install."
+ )
+ else:
+ from cuml.common.device_selection import set_global_device_type
+ set_global_device_type("gpu" if self._gpu else "cpu")
+
+ # See https://github.com/rapidsai/cuml/issues/5564
+ from cuml.internals.memory_utils import set_global_output_type
+ set_global_output_type("numpy")
self._engine = value
@property
- def backend(self) -> str:
+ def backend(self) -> BACKEND:
"""Parallelization backend."""
return self._backend
@backend.setter
- def backend(self, value: str):
- if value.lower() not in (opts := ("loky", "multiprocessing", "threading", "ray")):
- raise ValueError(
- f"Invalid value for the backend parameter, got "
- f"{value}. Choose from: {', '.join(opts)}."
- )
- elif value.lower() == "ray":
+ def backend(self, value: BACKEND):
+ if value == "ray":
register_ray() # Register ray as joblib backend
if not ray.is_initialized():
ray.init(log_to_driver=False)
@@ -211,35 +175,24 @@ def backend(self, value: str):
self._backend = value
@property
- def verbose(self) -> INT:
+ def verbose(self) -> Literal[0, 1, 2]:
"""Verbosity level of the output."""
return self._verbose
@verbose.setter
- def verbose(self, value: INT):
- if value < 0 or value > 2:
- raise ValueError(
- "Invalid value for the verbose parameter. Value"
- f" should be between 0 and 2, got {value}."
- )
+ def verbose(self, value: Literal[0, 1, 2]):
self._verbose = value
@property
- def warnings(self) -> str:
+ def warnings(self) -> WARNINGS:
"""Whether to show or suppress encountered warnings."""
return self._warnings
@warnings.setter
- def warnings(self, value: BOOL | str):
+ def warnings(self, value: BOOL | WARNINGS):
if isinstance(value, BOOL):
self._warnings = "default" if value else "ignore"
else:
- options = ("default", "error", "ignore", "always", "module", "once")
- if value not in options:
- raise ValueError(
- "Invalid value for the warnings parameter, got "
- f"{value}. Choose from: {', '.join(options)}."
- )
self._warnings = value
warnings.filterwarnings(self._warnings) # Change the filter in this process
@@ -336,7 +289,7 @@ def experiment(self, value: str | None):
mlflow.set_experiment(value)
@property
- def random_state(self) -> INT:
+ def random_state(self) -> INT | None:
"""Seed used by the random number generator."""
return self._random_state
@@ -351,6 +304,11 @@ def random_state(self, value: INT | None):
np.random.seed(value)
self._random_state = value
+ @property
+ def _gpu(self) -> BOOL:
+ """Return whether the instance uses a GPU implementation."""
+ return "gpu" in self.device.lower()
+
@property
def _device_id(self) -> int:
"""Which GPU device to use."""
@@ -392,7 +350,7 @@ def _inherit(self, obj: Any) -> Any:
return obj
- def _get_est_class(self, name: str, module: str) -> PREDICTOR:
+ def _get_est_class(self, name: str, module: str) -> ESTIMATOR:
"""Import a class from a module.
When the import fails, for example if atom uses sklearnex and
@@ -408,12 +366,13 @@ def _get_est_class(self, name: str, module: str) -> PREDICTOR:
Returns
-------
- Predictor
+ Estimator
Class of the estimator.
"""
try:
- return getattr(import_module(f"{self.engine['estimator']}.{module}"), name)
+ engine = self.engine.get("estimator", "sklearn")
+ return getattr(import_module(f"{engine}.{module}"), name)
except (ModuleNotFoundError, AttributeError):
return getattr(import_module(f"sklearn.{module}"), name)
@@ -925,7 +884,7 @@ def _has_data_sets(
if self.goal == "fc" and not isinstance(y, (INT, str)):
# arrays=() and y=y for forecasting
sets = _no_data_sets(*self._prepare_input(y=y))
- elif self.branch._data is None:
+ elif self.branch._data.empty:
raise ValueError(
"The data arrays are empty! Provide the data to run the pipeline "
"successfully. See the documentation for the allowed formats."
@@ -1042,7 +1001,7 @@ def log(self, msg: SCALAR | str, level: INT = 0, severity: str = "info"):
getattr(self.logger, severity)(str(text))
@composed(crash, method_to_log)
- def save(self, filename: str = "auto", *, save_data: bool = True):
+ def save(self, filename: str = "auto", *, save_data: BOOL = True):
"""Save the instance to a pickle file.
Parameters
diff --git a/atom/branch.py b/atom/branch.py
index 423d0ce28..9abf44789 100644
--- a/atom/branch.py
+++ b/atom/branch.py
@@ -43,8 +43,8 @@ class Branch:
name: str
Name of the branch.
- data: dataframe or None, default=None
- Complete dataset.
+ data: dataframe, default=pd.DataFrame()
+ Complete dataset. Defaults to an empty frame if not provided.
index: list or None, default=None
A list containing the number of target columns, the indices of
@@ -61,7 +61,7 @@ class Branch:
def __init__(
self,
name: str,
- data: DATAFRAME | None = None,
+ data: DATAFRAME = pd.DataFrame(),
index: list[INT, INDEX, INDEX] | None = None,
holdout: DATAFRAME | None = None,
parent: BRANCH | None = None,
@@ -69,7 +69,7 @@ def __init__(
self._data = data
self._idx = index
self._holdout = holdout
- self._pipeline = pd.Series(data=[], dtype="object")
+ self._pipeline = pd.Series(dtype="object")
self._mapping = CustomDict()
# If a parent branch is provided, transfer its attrs to this one
@@ -87,7 +87,7 @@ def __repr__(self) -> str:
return f"Branch({self.name})"
def __bool__(self):
- return self._data is not None
+ return not self._data.empty
@property
def name(self) -> str:
@@ -172,7 +172,7 @@ def counter(name: str, dim: str) -> str:
value = to_pandas(
data=value,
index=side.index if side_name else None,
- name=getattr(under, "name", None) if under_name else None,
+ name=getattr(under, "name", None) if under_name else "target",
columns=getattr(under, "columns", None) if under_name else None,
dtype=under.dtypes if under_name else None,
)
diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
index 4bcdbca48..59390a36b 100644
--- a/atom/data_cleaning.py
+++ b/atom/data_cleaning.py
@@ -47,8 +47,9 @@
from atom.basetransformer import BaseTransformer
from atom.utils.constants import MISSING_VALUES
from atom.utils.types import (
- BOOL, DATAFRAME, DATAFRAME_TYPES, ENGINE, ESTIMATOR, FEATURES, FLOAT, INT,
- PANDAS, SCALAR, SEQUENCE, SEQUENCE_TYPES, SERIES_TYPES, TARGET,
+ BOOL, DATAFRAME, DATAFRAME_TYPES, DISCRETIZER_STRATS, ENGINE, ESTIMATOR,
+ FEATURES, FLOAT, INT, PANDAS, PRUNER_STRATS, SCALAR, SCALER_STRATS,
+ SEQUENCE, SEQUENCE_TYPES, SERIES_TYPES, STRAT_NUM, TARGET,
)
from atom.utils.utils import (
CustomDict, bk, check_is_fitted, composed, crash, get_cols, it, lst, merge,
@@ -1082,7 +1083,7 @@ class Discretizer(BaseEstimator, TransformerMixin, BaseTransformer):
def __init__(
self,
- strategy: str = "quantile",
+ strategy: DISCRETIZER_STRATS = "quantile",
*,
bins: INT | SEQUENCE | dict = 5,
labels: SEQUENCE | dict | None = None,
@@ -1151,12 +1152,6 @@ def get_labels(labels, bins):
self._check_n_features(X, reset=True)
self._num_cols = list(X.select_dtypes(include="number"))
- if self.strategy.lower() not in ("uniform", "quantile", "kmeans", "custom"):
- raise ValueError(
- f"Invalid value for the strategy parameter, got {self.strategy}. "
- "Choose from: uniform, quantile, kmeans, custom."
- )
-
self.log("Fitting Discretizer...", 1)
labels = {} if self.labels is None else self.labels
@@ -1173,7 +1168,7 @@ def get_labels(labels, bins):
else:
bins = self.bins
- if self.strategy.lower() != "custom":
+ if self.strategy != "custom":
if isinstance(bins, SEQUENCE_TYPES):
try:
bins = bins[i] # Fetch the i-th bin for the i-th column
@@ -1186,15 +1181,16 @@ def get_labels(labels, bins):
estimator = self._get_est_class("KBinsDiscretizer", "preprocessing")
- # cuML implementation has no random_state
+ # cuML implementation has no subsample and random_state
kwargs = {}
- if "random_state" in sign(estimator):
+ if "subsample" in sign(estimator):
+ kwargs["subsample"] = 200000
kwargs["random_state"] = self.random_state
self._discretizers[col] = estimator(
n_bins=bins,
encode="ordinal",
- strategy=self.strategy.lower(),
+ strategy=self.strategy,
**kwargs,
).fit(X[[col]])
@@ -1806,7 +1802,7 @@ class Imputer(BaseEstimator, TransformerMixin, BaseTransformer):
def __init__(
self,
- strat_num: SCALAR | Literal["drop", "mean", "knn", "most_frequent"] = "drop",
+ strat_num: STRAT_NUM = "drop",
strat_cat: Literal["drop", "most_frequent"] | str = "drop",
*,
max_nan_rows: SCALAR | None = None,
@@ -1853,12 +1849,6 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Imputer:
self._num_cols = list(X.select_dtypes(include="number"))
# Check input Parameters
- strategies = ["drop", "mean", "median", "knn", "most_frequent"]
- if isinstance(self.strat_num, str) and self.strat_num.lower() not in strategies:
- raise ValueError(
- "Unknown strategy for the strat_num parameter, got "
- f"{self.strat_num}. Choose from: {', '.join(strategies)}."
- )
if self.max_nan_rows:
if self.max_nan_rows < 0:
raise ValueError(
@@ -1902,10 +1892,8 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Imputer:
self._imputers = {}
# Load the imputer class from sklearn or cuml (different modules)
- estimator = self._get_est_class(
- name="SimpleImputer",
- module="preprocessing" if self.engine["estimator"] == "cuml" else "impute",
- )
+ module = "preprocessing" if self.engine.get("estimator") == "cuml" else "impute"
+ estimator = self._get_est_class("SimpleImputer", module)
# Assign an imputer to each column
for name, column in X.items():
@@ -2496,11 +2484,11 @@ class Pruner(BaseEstimator, TransformerMixin, BaseTransformer):
def __init__(
self,
- strategy: str | SEQUENCE = "zscore",
+ strategy: PRUNER_STRATS | SEQUENCE = "zscore",
*,
method: SCALAR | Literal["drop", "minmax"] = "drop",
max_sigma: SCALAR = 3,
- include_target: bool = False,
+ include_target: BOOL = False,
device: str = "cpu",
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
verbose: Literal[0, 1, 2] = 0,
@@ -2800,8 +2788,8 @@ class Scaler(BaseEstimator, TransformerMixin, BaseTransformer):
def __init__(
self,
- strategy: str = "standard",
- include_binary: bool = False,
+ strategy: SCALER_STRATS = "standard",
+ include_binary: BOOL = False,
*,
device: str = "cpu",
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
@@ -2853,14 +2841,8 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Scaler:
robust="RobustScaler",
)
- if self.strategy in strategies:
- estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
- self._estimator = estimator(**self.kwargs)
- else:
- raise ValueError(
- f"Invalid value for the strategy parameter, got {self.strategy}. "
- f"Choose from: {', '.join(strategies)}."
- )
+ estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
+ self._estimator = estimator(**self.kwargs)
self.log("Fitting Scaler...", 1)
self._estimator.fit(X[self._num_cols])
diff --git a/atom/ensembles.py b/atom/ensembles.py
index 50cfec4f2..3763bd138 100644
--- a/atom/ensembles.py
+++ b/atom/ensembles.py
@@ -381,7 +381,7 @@ def fit(
X: FEATURES,
y: SEQUENCE,
sample_weight: SEQUENCE | None = None,
- ) -> VotingRegressor:
+ ) -> StackingClassifier:
"""Fit the estimators, skipping prefit ones.
Parameters
diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py
index 3c5482582..8f4238032 100644
--- a/atom/feature_engineering.py
+++ b/atom/feature_engineering.py
@@ -13,7 +13,7 @@
from collections import defaultdict
from logging import Logger
from random import sample
-from typing import Callable, Literal
+from typing import Literal
import featuretools as ft
import joblib
@@ -36,10 +36,10 @@
from atom.basetransformer import BaseTransformer
from atom.data_cleaning import Scaler, TransformerMixin
from atom.models import MODELS
-from atom.plots import FeatureSelectorPlot
+from atom.plots import FeatureSelectionPlot
from atom.utils.types import (
- BOOL, DATAFRAME, ENGINE, FEATURES, FLOAT, INT, INT_TYPES, SCALAR, SEQUENCE,
- SEQUENCE_TYPES, SERIES_TYPES, TARGET,
+ BOOL, DATAFRAME, ENGINE, ESTIMATOR, FEATURES, FLOAT, INT, INT_TYPES,
+ SCALAR, SEQUENCE, SEQUENCE_TYPES, SERIES_TYPES, TARGET,
)
from atom.utils.utils import (
CustomDict, check_is_fitted, check_scaling, composed, crash,
@@ -844,7 +844,7 @@ class FeatureSelector(
BaseEstimator,
TransformerMixin,
BaseTransformer,
- FeatureSelectorPlot,
+ FeatureSelectionPlot,
):
"""Reduce the number of features in the data.
@@ -1118,7 +1118,7 @@ def __init__(
self,
strategy: str | None = None,
*,
- solver: str | Callable | None = None,
+ solver: str | ESTIMATOR | None = None,
n_features: SCALAR | None = None,
min_repeated: SCALAR | None = 2,
max_repeated: SCALAR | None = 1.0,
diff --git a/atom/models/__init__.py b/atom/models/__init__.py
new file mode 100644
index 000000000..54274dfa9
--- /dev/null
+++ b/atom/models/__init__.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module for models.
+
+To add new models note the following:
+
+1. Add the class in the right file depending on task.
+2. Models are ordered alphabetically.
+3. Models have the following structure:
+
+ Class attributes
+ ----------------
+ acronym: str
+ Acronym of the model's name.
+
+ needs_scaling: bool
+ Whether the model needs scaled features.
+
+ accepts_sparse: bool
+ Whether the model has native support for sparse matrices.
+
+ native_multilabel: bool
+ Whether the model has native support for multilabel tasks.
+
+ native_multioutput: bool
+ Whether the model has native support for multioutput tasks.
+
+ has_validation: str or None
+ Whether the model allows in-training validation. If str,
+ name of the estimator's parameter that states the number
+ of iterations. If None, no support for in-training
+ validation.
+
+ supports_engines: list
+ Engines that can be used to run this model.
+
+ _module: str
+ Module from which to load the class. If the module lives
+ under an engine package, omit the engine prefix, e.g. use
+ "ensemble" instead of "sklearn.ensemble".
+
+ _estimators: CustomDict
+ Name of the estimators per goal.
+
+ Instance attributes
+ -------------------
+ name: str
+ Name of the model. Defaults to the same as the acronym
+ but can be different if the same model is called multiple
+ times. The name is assigned in the basemodel.py module.
+
+ Methods
+ -------
+ _get_parameters(self, trial) -> CustomDict:
+ Return the trial's suggestions with rounded decimals and
+ (optionally) custom changes to the params. Don't implement
+ if the parent's implementation is sufficient.
+
+ _trial_to_est(self, params) -> CustomDict:
+ Convert trial's hyperparameters to parameters for the
+ estimator. Only implement for models whose study params are
+ different from those for the estimator.
+
+ _fit_estimator(self, estimator, data, est_params_fit, validation, trial):
+ This method is called to fit the estimator. Implement only
+ to customize the fit.
+
+ _get_distributions(self) -> CustomDict:
+ Return a list of the hyperparameter distributions for
+ optimization.
+
+"""
+
+from atom.basemodel import ClassRegModel
+from atom.models.classreg import (
+ AdaBoost, AutomaticRelevanceDetermination, Bagging, BayesianRidge,
+ BernoulliNB, CatBoost, CategoricalNB, ComplementNB, DecisionTree, Dummy,
+ ElasticNet, ExtraTree, ExtraTrees, GaussianNB, GaussianProcess,
+ GradientBoostingMachine, HistGradientBoosting, HuberRegression,
+ KNearestNeighbors, Lasso, LeastAngleRegression, LightGBM,
+ LinearDiscriminantAnalysis, LinearSVM, LogisticRegression,
+ MultiLayerPerceptron, MultinomialNB, OrdinaryLeastSquares,
+ OrthogonalMatchingPursuit, PassiveAggressive, Perceptron,
+ QuadraticDiscriminantAnalysis, RadiusNearestNeighbors, RandomForest, Ridge,
+ StochasticGradientDescent, SupportVectorMachine, XGBoost,
+)
+from atom.models.ensembles import Stacking, Voting
+from atom.models.ts import (
+ ARIMA, ETS, AutoARIMA, ExponentialSmoothing, NaiveForecaster,
+ PolynomialTrend,
+)
+from atom.utils.types import PREDICTOR
+from atom.utils.utils import ClassMap
+
+
+# Available models
+MODELS = ClassMap(
+ AdaBoost,
+ ARIMA,
+ AutoARIMA,
+ AutomaticRelevanceDetermination,
+ Bagging,
+ BayesianRidge,
+ BernoulliNB,
+ CatBoost,
+ CategoricalNB,
+ ComplementNB,
+ DecisionTree,
+ Dummy,
+ ElasticNet,
+ ETS,
+ ExponentialSmoothing,
+ ExtraTree,
+ ExtraTrees,
+ GaussianNB,
+ GaussianProcess,
+ GradientBoostingMachine,
+ HuberRegression,
+ HistGradientBoosting,
+ KNearestNeighbors,
+ Lasso,
+ LeastAngleRegression,
+ LightGBM,
+ LinearDiscriminantAnalysis,
+ LinearSVM,
+ LogisticRegression,
+ MultiLayerPerceptron,
+ MultinomialNB,
+ NaiveForecaster,
+ OrdinaryLeastSquares,
+ OrthogonalMatchingPursuit,
+ PassiveAggressive,
+ Perceptron,
+ PolynomialTrend,
+ QuadraticDiscriminantAnalysis,
+ RadiusNearestNeighbors,
+ RandomForest,
+ Ridge,
+ StochasticGradientDescent,
+ SupportVectorMachine,
+ XGBoost,
+ key="acronym",
+)
+
+# Available ensembles
+ENSEMBLES = ClassMap(Stacking, Voting, key="acronym")
+
+# Available models + ensembles
+MODELS_ENSEMBLES = ClassMap(*MODELS, *ENSEMBLES, key="acronym")
+
+
+class CustomModel(ClassRegModel):
+ """Model with estimator provided by user."""
+
+ def __init__(self, **kwargs):
+ if callable(est := kwargs.pop("estimator")): # Estimator provided by the user
+ self._est = est
+ self._params = {}
+ else:
+ self._est = est.__class__
+ self._params = est.get_params() # Store the provided parameters
+
+ if hasattr(est, "name"):
+ name = est.name
+ else:
+ # If no name is provided, use the name of the class
+ name = self._fullname
+ if len(n := list(filter(str.isupper, name))) >= 2 and n not in MODELS:
+ name = "".join(n)
+
+ self.acronym = getattr(est, "acronym", name)
+ if not name.startswith(self.acronym):
+ raise ValueError(
+ f"The name ({name}) and acronym ({self.acronym}) of model "
+ f"{self._fullname} do not match. The name should start with "
+ f"the model's acronym."
+ )
+
+ self.needs_scaling = getattr(est, "needs_scaling", False)
+ self.native_multilabel = getattr(est, "native_multilabel", False)
+ self.native_multioutput = getattr(est, "native_multioutput", False)
+ self.has_validation = getattr(est, "has_validation", None)
+
+ super().__init__(name=name, **kwargs)
+
+ @property
+ def _fullname(self) -> str:
+ """Return the estimator's class name."""
+ return self._est_class.__name__
+
+ @property
+ def _est_class(self):
+ """Return the estimator's class."""
+ return self._est
+
+ def _get_est(self, **params) -> PREDICTOR:
+ """Get the model's estimator with unpacked parameters.
+
+ Returns
+ -------
+ PREDICTOR
+ Estimator instance.
+
+ """
+ return super()._get_est(**{**self._params, **params})
diff --git a/atom/models.py b/atom/models/classreg.py
similarity index 75%
rename from atom/models.py
rename to atom/models/classreg.py
index c29b6e52a..abb83c5bb 100644
--- a/atom/models.py
+++ b/atom/models/classreg.py
@@ -1,4081 +1,3254 @@
-# -*- coding: utf-8 -*-
-
-"""
-Automated Tool for Optimized Modelling (ATOM)
-Author: Mavs
-Description: Module containing all available models. The models are
- ordered alphabetically. Classes must have the following
- structure:
-
- Class attributes
- ----------------
- acronym: str
- Acronym of the model's name.
-
- needs_scaling: bool
- Whether the model needs scaled features.
-
- accepts_sparse: bool
- Whether the model has native support for sparse matrices.
-
- native_multilabel: bool
- Whether the model has native support for multilabel tasks.
-
- native_multioutput: bool
- Whether the model has native support for multioutput tasks.
-
- has_validation: str or None
- Whether the model allows in-training validation. If str,
- name of the estimator's parameter that states the number
- of iterations. If None, no support for in-training
- validation.
-
- supports_engines: list
- Engines that can be used to run this model.
-
- _module: str
- Module from which to load the class. If one of engines,
- ignore the engine name, i.e. use "ensemble" instead of
- "sklearn.ensemble".
-
- _estimators: CustomDict
- Name of the estimators per goal.
-
- Instance attributes
- -------------------
- name: str
- Name of the model. Defaults to the same as the acronym
- but can be different if the same model is called multiple
- times. The name is assigned in the basemodel.py module.
-
- Methods
- -------
- _get_parameters(self, x) -> CustomDict:
- Return the trial's suggestions with rounded decimals and
- (optionally) custom changes to the params. Don't implement
- if the parent's implementation is sufficient.
-
- _trial_to_est(self, params) -> CustomDict:
- Convert trial's hyperparameters to parameters for the
- estimator. Only implement for models whose study params are
- different than those for the estimator.
-
- _fit_estimator(self, estimator, data, est_params_fit, validation, trial):
- This method is called to fit the estimator. Implement only
- to customize the fit.
-
- _get_distributions(self) -> CustomDict:
- Return a list of the hyperparameter distributions for
- optimization.
-
-"""
-
-from __future__ import annotations
-
-import numpy as np
-from optuna.distributions import CategoricalDistribution as Cat
-from optuna.distributions import FloatDistribution as Float
-from optuna.distributions import IntDistribution as Int
-from optuna.exceptions import TrialPruned
-from optuna.integration import (
- CatBoostPruningCallback, LightGBMPruningCallback, XGBoostPruningCallback,
-)
-from optuna.trial import Trial
-
-from atom.basemodel import ClassRegModel, ForecastModel
-from atom.pipeline import Pipeline
-from atom.utils.types import DATAFRAME, PREDICTOR, SERIES
-from atom.utils.utils import (
- CatBMetric, ClassMap, CustomDict, LGBMetric, XGBMetric, sign,
-)
-
-
-# Custom models ==================================================== >>
-
-class CustomModel(ClassRegModel):
- """Model with estimator provided by user."""
-
- def __init__(self, **kwargs):
- if callable(est := kwargs.pop("estimator")): # Estimator provided by the user
- self._est = est
- self._params = {}
- else:
- self._est = est.__class__
- self._params = est.get_params() # Store the provided parameters
-
- if hasattr(est, "name"):
- name = est.name
- else:
- # If no name is provided, use the name of the class
- name = self._fullname
- if len(n := list(filter(str.isupper, name))) >= 2 and n not in MODELS:
- name = "".join(n)
-
- self.acronym = getattr(est, "acronym", name)
- if not name.startswith(self.acronym):
- raise ValueError(
- f"The name ({name}) and acronym ({self.acronym}) of model "
- f"{self._fullname} do not match. The name should start with "
- f"the model's acronym."
- )
-
- self.needs_scaling = getattr(est, "needs_scaling", False)
- self.native_multilabel = getattr(est, "native_multilabel", False)
- self.native_multioutput = getattr(est, "native_multioutput", False)
- self.has_validation = getattr(est, "has_validation", None)
-
- super().__init__(name=name, **kwargs)
-
- @property
- def _fullname(self) -> str:
- """Return the estimator's class name."""
- return self._est_class.__name__
-
- @property
- def _est_class(self):
- """Return the estimator's class."""
- return self._est
-
- def _get_est(self, **params) -> PREDICTOR:
- """Get the model's estimator with unpacked parameters.
-
- Returns
- -------
- PREDICTOR
- Estimator instance.
-
- """
- return super()._get_est(**{**self._params, **params})
-
-
-# Classification and Regression models ============================= >>
-
-class AdaBoost(ClassRegModel):
- """Adaptive Boosting (with decision tree as base estimator).
-
- AdaBoost is a meta-estimator that begins by fitting a
- classifier/regressor on the original dataset and then fits
- additional copies of the algorithm on the same dataset but where
- the weights of instances are adjusted according to the error of
- the current prediction.
-
- Corresponding estimators are:
-
- - [AdaBoostClassifier][] for classification tasks.
- - [AdaBoostRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][adabdocs].
-
- See Also
- --------
- atom.models:GradientBoostingMachine
- atom.models:RandomForest
- atom.models:XGBoost
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="AdaB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "AdaB"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "ensemble"
- _estimators = CustomDict({"class": "AdaBoostClassifier", "reg": "AdaBoostRegressor"})
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- dist = CustomDict(
- n_estimators=Int(50, 500, step=10),
- learning_rate=Float(0.01, 10, log=True),
- )
-
- if self.goal == "class":
- dist["algorithm"] = Cat(["SAMME.R", "SAMME"])
- else:
- dist["loss"] = Cat(["linear", "square", "exponential"])
-
- return dist
-
-
-class AutomaticRelevanceDetermination(ClassRegModel):
- """Automatic Relevance Determination.
-
- Automatic Relevance Determination is very similar to
- [BayesianRidge][], but can lead to sparser coefficients. Fit the
- weights of a regression model, using an ARD prior. The weights of
- the regression model are assumed to be in Gaussian distributions.
-
- Corresponding estimators are:
-
- - [ARDRegression][] for regression tasks.
-
- Read more in sklearn's [documentation][arddocs].
-
- See Also
- --------
- atom.models:BayesianRidge
- atom.models:GaussianProcess
- atom.models:LeastAngleRegression
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import fetch_california_housing
-
- X, y = fetch_california_housing(return_X_y=True)
-
- atom = ATOMRegressor(X, y, random_state=1)
- atom.run(models="ARD", metric="r2", verbose=2)
- ```
-
- """
-
- acronym = "ARD"
- needs_scaling = True
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "linear_model"
- _estimators = CustomDict({"reg": "ARDRegression"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- n_iter=Int(100, 1000, step=10),
- alpha_1=Float(1e-4, 1, log=True),
- alpha_2=Float(1e-4, 1, log=True),
- lambda_1=Float(1e-4, 1, log=True),
- lambda_2=Float(1e-4, 1, log=True),
- )
-
-
-class Bagging(ClassRegModel):
- """Bagging model (with decision tree as base estimator).
-
- Bagging uses an ensemble meta-estimator that fits base predictors
- on random subsets of the original dataset and then aggregate their
- individual predictions (either by voting or by averaging) to form a
- final prediction. Such a meta-estimator can typically be used as a
- way to reduce the variance of a black-box estimator by introducing
- randomization into its construction procedure and then making an
- ensemble out of it.
-
- Corresponding estimators are:
-
- - [BaggingClassifier][] for classification tasks.
- - [BaggingRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][bagdocs].
-
- See Also
- --------
- atom.models:DecisionTree
- atom.models:LogisticRegression
- atom.models:RandomForest
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="Bag", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "Bag"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "ensemble"
- _estimators = CustomDict({"class": "BaggingClassifier", "reg": "BaggingRegressor"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- n_estimators=Int(10, 500, step=10),
- max_samples=Float(0.5, 1.0, step=0.1),
- max_features=Float(0.5, 1.0, step=0.1),
- bootstrap=Cat([True, False]),
- bootstrap_features=Cat([True, False]),
- )
-
-
-class BayesianRidge(ClassRegModel):
- """Bayesian ridge regression.
-
- Bayesian regression techniques can be used to include regularization
- parameters in the estimation procedure: the regularization parameter
- is not set in a hard sense but tuned to the data at hand.
-
- Corresponding estimators are:
-
- - [BayesianRidge][bayesianridgeclass] for regression tasks.
-
- Read more in sklearn's [documentation][brdocs].
-
- See Also
- --------
- atom.models:AutomaticRelevanceDetermination
- atom.models:GaussianProcess
- atom.models:LeastAngleRegression
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import fetch_california_housing
-
- X, y = fetch_california_housing(return_X_y=True)
-
- atom = ATOMRegressor(X, y, random_state=1)
- atom.run(models="BR", metric="r2", verbose=2)
- ```
-
- """
-
- acronym = "BR"
- needs_scaling = True
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "linear_model"
- _estimators = CustomDict({"reg": "BayesianRidge"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- n_iter=Int(100, 1000, step=10),
- alpha_1=Float(1e-4, 1, log=True),
- alpha_2=Float(1e-4, 1, log=True),
- lambda_1=Float(1e-4, 1, log=True),
- lambda_2=Float(1e-4, 1, log=True),
- )
-
-
-class BernoulliNB(ClassRegModel):
- """Bernoulli Naive Bayes.
-
- BernoulliNB implements the Naive Bayes algorithm for multivariate
- Bernoulli models. Like [MultinomialNB][], this classifier is
- suitable for discrete data. The difference is that while MNB works
- with occurrence counts, BNB is designed for binary/boolean features.
-
- Corresponding estimators are:
-
- - [BernoulliNB][bernoullinbclass] for classification tasks.
-
- Read more in sklearn's [documentation][bnbdocs].
-
- See Also
- --------
- atom.models:ComplementNB
- atom.models:CategoricalNB
- atom.models:MultinomialNB
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="BNB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "BNB"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn", "cuml"]
-
- _module = "naive_bayes"
- _estimators = CustomDict({"class": "BernoulliNB"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- )
-
-
-class CatBoost(ClassRegModel):
- """Cat Boosting Machine.
-
- CatBoost is a machine learning method based on gradient boosting
- over decision trees. Main advantages of CatBoost:
-
- - Superior quality when compared with other GBDT models on many
- datasets.
- - Best in class prediction speed.
-
- Corresponding estimators are:
-
- - [CatBoostClassifier][] for classification tasks.
- - [CatBoostRegressor][] for regression tasks.
-
- Read more in CatBoost's [documentation][catbdocs].
-
- !!! warning
- * CatBoost selects the weights achieved by the best evaluation
- on the test set after training. This means that, by default,
- there is some minor data leakage in the test set. Use the
- `use_best_model=False` parameter to avoid this behavior or use
- a [holdout set][data-sets] to evaluate the final estimator.
- * [In-training validation][] and [pruning][] are disabled when
- `#!python device="gpu"`.
-
- !!! note
- ATOM uses CatBoost's `n_estimators` parameter instead of
- `iterations` to indicate the number of trees to fit. This is
- done to have consistent naming with the [XGBoost][] and
- [LightGBM][] models.
-
- See Also
- --------
- atom.models:GradientBoostingMachine
- atom.models:LightGBM
- atom.models:XGBoost
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="CatB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "CatB"
- needs_scaling = True
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = "n_estimators"
- supports_engines = ["catboost"]
-
- _module = "catboost"
- _estimators = CustomDict({"class": "CatBoostClassifier", "reg": "CatBoostRegressor"})
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if self._get_param("bootstrap_type", params) == "Bernoulli":
- params.pop("bagging_temperature")
- elif self._get_param("bootstrap_type", params) == "Bayesian":
- params.pop("subsample")
-
- return params
-
- def _get_est(self, **params) -> PREDICTOR:
- """Get the estimator instance.
-
- Parameters
- ----------
- **params
- Unpacked hyperparameters for the estimator.
-
- Returns
- -------
- Predictor
- Estimator instance.
-
- """
- eval_metric = None
- if getattr(self, "_metric", None) and not self._gpu:
- eval_metric = CatBMetric(self._metric[0], task=self.task)
-
- return self._est_class(
- eval_metric=params.pop("eval_metric", eval_metric),
- train_dir=params.pop("train_dir", ""),
- allow_writing_files=params.pop("allow_writing_files", False),
- thread_count=params.pop("n_jobs", self.n_jobs),
- task_type=params.pop("task_type", "GPU" if self._gpu else "CPU"),
- devices=str(self._device_id),
- verbose=params.pop("verbose", False),
- random_state=params.pop("random_state", self.random_state),
- **params,
- )
-
- def _fit_estimator(
- self,
- estimator: PREDICTOR,
- data: tuple[DATAFRAME, SERIES],
- est_params_fit: dict,
- validation: tuple[DATAFRAME, SERIES] | None = None,
- trial: Trial | None = None,
- ):
- """Fit the estimator and perform in-training validation.
-
- Parameters
- ----------
- estimator: Predictor
- Instance to fit.
-
- data: tuple
- Training data of the form (X, y).
-
- est_params_fit: dict
- Additional parameters for the estimator's fit method.
-
- validation: tuple or None
- Validation data of the form (X, y). If None, no validation
- is performed.
-
- trial: [Trial][] or None
- Active trial (during hyperparameter tuning).
-
- Returns
- -------
- Predictor
- Fitted instance.
-
- """
- params = est_params_fit.copy()
-
- callbacks = params.pop("callbacks", [])
- if trial and len(self._metric) == 1 and not self._gpu:
- callbacks.append(cb := CatBoostPruningCallback(trial, "CatBMetric"))
-
- # gpu implementation fails if callbacks!=None
- estimator.fit(*data, eval_set=validation, callbacks=callbacks or None, **params)
-
- if not self._gpu:
- if validation:
- # Create evals attribute with train and validation scores
- m = self._metric[0].name
- evals = estimator.evals_result_
- self._evals[f"{m}_train"] = evals["learn"]["CatBMetric"]
- self._evals[f"{m}_test"] = evals["validation"]["CatBMetric"]
-
- if trial and len(self._metric) == 1 and cb._pruned:
- # Add the pruned step to the output
- step = len(self.evals[f'{m}_train'])
- steps = estimator.get_params()[self.has_validation]
- trial.params[self.has_validation] = f"{step}/{steps}"
-
- trial.set_user_attr("estimator", estimator)
- raise TrialPruned(cb._message)
-
- return estimator
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- n_estimators=Int(20, 500, step=10),
- learning_rate=Float(0.01, 1.0, log=True),
- max_depth=Cat([None, *range(1, 17)]),
- min_child_samples=Int(1, 30),
- bootstrap_type=Cat(["Bayesian", "Bernoulli"]),
- bagging_temperature=Float(0, 10),
- subsample=Float(0.5, 1.0, step=0.1),
- reg_lambda=Float(0.001, 100, log=True),
- )
-
-
-class CategoricalNB(ClassRegModel):
- """Categorical Naive Bayes.
-
- Categorical Naive Bayes implements the Naive Bayes algorithm for
- categorical features.
-
- Corresponding estimators are:
-
- - [CategoricalNB][categoricalnbclass] for classification tasks.
-
- Read more in sklearn's [documentation][catnbdocs].
-
- See Also
- --------
- atom.models:BernoulliNB
- atom.models:ComplementNB
- atom.models:GaussianNB
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- import numpy as np
-
- X = np.random.randint(5, size=(100, 100))
- y = np.random.randint(2, size=100)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="CatNB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "CatNB"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn", "cuml"]
-
- _module = "naive_bayes"
- _estimators = CustomDict({"class": "CategoricalNB"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- )
-
-
-class ComplementNB(ClassRegModel):
- """Complement Naive Bayes.
-
- The Complement Naive Bayes classifier was designed to correct the
- "severe assumptions" made by the standard [MultinomialNB][]
- classifier. It is particularly suited for imbalanced datasets.
-
- Corresponding estimators are:
-
- - [ComplementNB][complementnbclass] for classification tasks.
-
- Read more in sklearn's [documentation][cnbdocs].
-
- See Also
- --------
- atom.models:BernoulliNB
- atom.models:CategoricalNB
- atom.models:MultinomialNB
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="CNB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "CNB"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn", "cuml"]
-
- _module = "naive_bayes"
- _estimators = CustomDict({"class": "ComplementNB"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- alpha=Float(0.01, 10, log=True),
- fit_prior=Cat([True, False]),
- norm=Cat([True, False]),
- )
-
-
-class DecisionTree(ClassRegModel):
- """Single Decision Tree.
-
- A single decision tree classifier/regressor.
-
- Corresponding estimators are:
-
- - [DecisionTreeClassifier][] for classification tasks.
- - [DecisionTreeRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][treedocs].
-
- See Also
- --------
- atom.models:ExtraTree
- atom.models:ExtraTrees
- atom.models:RandomForest
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="Tree", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "Tree"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = True
- native_multioutput = True
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "tree"
- _estimators = CustomDict(
- {"class": "DecisionTreeClassifier", "reg": "DecisionTreeRegressor"}
- )
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- if self.goal == "class":
- criterion = ["gini", "entropy"]
- else:
- criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"]
-
- return CustomDict(
- criterion=Cat(criterion),
- splitter=Cat(["best", "random"]),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
-
-
-class Dummy(ClassRegModel):
- """Dummy classifier/regressor.
-
- When doing supervised learning, a simple sanity check consists of
- comparing one's estimator against simple rules of thumb. The
- prediction methods completely ignore the input data. Do not use
- this model for real problems. Use it only as a simple baseline
- to compare with other models.
-
- Corresponding estimators are:
-
- - [DummyClassifier][] for classification tasks.
- - [DummyRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][dummydocs].
-
- See Also
- --------
- atom.models:DecisionTree
- atom.models:ExtraTree
- atom.models:NaiveForecaster
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="Dummy", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "Dummy"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "dummy"
- _estimators = CustomDict({"class": "DummyClassifier", "reg": "DummyRegressor"})
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if self._get_param("strategy", params) != "quantile":
- params.pop("quantile")
-
- return params
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- dist = CustomDict()
- if self.goal == "class":
- dist["strategy"] = Cat(["most_frequent", "prior", "stratified", "uniform"])
- else:
- dist["strategy"] = Cat(["mean", "median", "quantile"])
- dist["quantile"] = Float(0, 1.0, step=0.1)
-
- return dist
-
-
-class ElasticNet(ClassRegModel):
- """Linear Regression with elasticnet regularization.
-
- Linear least squares with l1 and l2 regularization.
-
- Corresponding estimators are:
-
- - [ElasticNet][elasticnetreg] for regression tasks.
-
- Read more in sklearn's [documentation][endocs].
-
- See Also
- --------
- atom.models:Lasso
- atom.models:OrdinaryLeastSquares
- atom.models:Ridge
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import fetch_california_housing
-
- X, y = fetch_california_housing(return_X_y=True)
-
- atom = ATOMRegressor(X, y, random_state=1)
- atom.run(models="EN", metric="r2", verbose=2)
- ```
-
- """
-
- acronym = "EN"
- needs_scaling = True
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
-
- _module = "linear_model"
- _estimators = CustomDict({"reg": "ElasticNet"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- alpha=Float(1e-3, 10, log=True),
- l1_ratio=Float(0.1, 0.9, step=0.1),
- selection=Cat(["cyclic", "random"]),
- )
-
-
-class ExtraTree(ClassRegModel):
- """Extremely Randomized Tree.
-
- Extra-trees differ from classic decision trees in the way they are
- built. When looking for the best split to separate the samples of a
- node into two groups, random splits are drawn for each of the
- max_features randomly selected features and the best split among
- those is chosen. When max_features is set 1, this amounts to
- building a totally random decision tree.
-
- Corresponding estimators are:
-
- - [ExtraTreeClassifier][] for classification tasks.
- - [ExtraTreeRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][treedocs].
-
- See Also
- --------
- atom.models:DecisionTree
- atom.models:ExtraTrees
- atom.models:RandomForest
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="ETree", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "ETree"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = True
- native_multioutput = True
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "tree"
- _estimators = CustomDict(
- {"class": "ExtraTreeClassifier", "reg": "ExtraTreeRegressor"}
- )
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if not self._get_param("bootstrap", params):
- params.pop("max_samples")
-
- return params
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- if self.goal == "class":
- criterion = ["gini", "entropy"]
- else:
- criterion = ["squared_error", "absolute_error"]
-
- return CustomDict(
- criterion=Cat(criterion),
- splitter=Cat(["random", "best"]),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
-
-
-class ExtraTrees(ClassRegModel):
- """Extremely Randomized Trees.
-
- Extra-Trees use a meta estimator that fits a number of randomized
- decision trees (a.k.a. [extra-trees][extratree]) on various
- sub-samples of the dataset and uses averaging to improve the
- predictive accuracy and control over-fitting.
-
- Corresponding estimators are:
-
- - [ExtraTreesClassifier][] for classification tasks.
- - [ExtraTreesRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][etdocs].
-
- See Also
- --------
- atom.models:DecisionTree
- atom.models:ExtraTree
- atom.models:RandomForest
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="ET", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "ET"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = True
- native_multioutput = True
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "ensemble"
- _estimators = CustomDict(
- {"class": "ExtraTreesClassifier", "reg": "ExtraTreesRegressor"}
- )
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if not self._get_param("bootstrap", params):
- params.pop("max_samples")
-
- return params
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- if self.goal == "class":
- criterion = ["gini", "entropy"]
- else:
- criterion = ["squared_error", "absolute_error"]
-
- return CustomDict(
- n_estimators=Int(10, 500, step=10),
- criterion=Cat(criterion),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- bootstrap=Cat([True, False]),
- max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
-
-
-class GaussianNB(ClassRegModel):
- """Gaussian Naive Bayes.
-
- Gaussian Naive Bayes implements the Naive Bayes algorithm for
- classification. The likelihood of the features is assumed to
- be Gaussian.
-
- Corresponding estimators are:
-
- - [GaussianNB][gaussiannbclass] for classification tasks.
-
- Read more in sklearn's [documentation][gnbdocs].
-
- See Also
- --------
- atom.models:BernoulliNB
- atom.models:CategoricalNB
- atom.models:ComplementNB
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="GNB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "GNB"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn", "cuml"]
-
- _module = "naive_bayes"
- _estimators = CustomDict({"class": "GaussianNB"})
-
-
-class GaussianProcess(ClassRegModel):
- """Gaussian process.
-
- Gaussian Processes are a generic supervised learning method
- designed to solve regression and probabilistic classification
- problems. The advantages of Gaussian processes are:
-
- * The prediction interpolates the observations.
- * The prediction is probabilistic (Gaussian) so that one can compute
- empirical confidence intervals and decide based on those if one
- should refit (online fitting, adaptive fitting) the prediction in
- some region of interest.
-
- The disadvantages of Gaussian processes include:
-
- * They are not sparse, i.e. they use the whole samples/features
- information to perform the prediction.
- * They lose efficiency in high dimensional spaces, namely when the
- number of features exceeds a few dozens.
-
- Corresponding estimators are:
-
- - [GaussianProcessClassifier][] for classification tasks.
- - [GaussianProcessRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][gpdocs].
-
- See Also
- --------
- atom.models:GaussianNB
- atom.models:LinearDiscriminantAnalysis
- atom.models:PassiveAggressive
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="GP", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "GP"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "gaussian_process"
- _estimators = CustomDict(
- {"class": "GaussianProcessClassifier", "reg": "GaussianProcessRegressor"}
- )
-
-
-class GradientBoostingMachine(ClassRegModel):
- """Gradient Boosting Machine.
-
- A Gradient Boosting Machine builds an additive model in a forward
- stage-wise fashion; it allows for the optimization of arbitrary
- differentiable loss functions. In each stage `n_classes_` regression
- trees are fit on the negative gradient of the loss function, e.g.
- binary or multiclass log loss. Binary classification is a special
- case where only a single regression tree is induced.
-
- Corresponding estimators are:
-
- - [GradientBoostingClassifier][] for classification tasks.
- - [GradientBoostingRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][gbmdocs].
-
- !!! tip
- [HistGradientBoosting][] is a much faster variant of this
- algorithm for intermediate datasets (n_samples >= 10k).
-
- See Also
- --------
- atom.models:CatBoost
- atom.models:HistGradientBoosting
- atom.models:LightGBM
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="GBM", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "GBM"
- needs_scaling = False
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "ensemble"
- _estimators = CustomDict(
- {"class": "GradientBoostingClassifier", "reg": "GradientBoostingRegressor"}
- )
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if self._get_param("loss", params) not in ("huber", "quantile"):
- params.pop("alpha")
-
- return params
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- dist = CustomDict(
- loss=Cat(["log_loss", "exponential"]),
- learning_rate=Float(0.01, 1.0, log=True),
- n_estimators=Int(10, 500, step=10),
- subsample=Float(0.5, 1.0, step=0.1),
- criterion=Cat(["friedman_mse", "squared_error"]),
- min_samples_split=Int(2, 20),
- min_samples_leaf=Int(1, 20),
- max_depth=Int(1, 21),
- max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
- ccp_alpha=Float(0, 0.035, step=0.005),
- )
-
- if self.task.startswith("multiclass"):
- dist.pop("loss") # Multiclass only supports log_loss
- elif self.goal.startswith("reg"):
- dist["loss"] = Cat(["squared_error", "absolute_error", "huber", "quantile"])
- dist["alpha"] = Float(0.1, 0.9, step=0.1)
-
- return dist
-
-
-class HuberRegression(ClassRegModel):
- """Huber regressor.
-
- Huber is a linear regression model that is robust to outliers. It
- makes sure that the loss function is not heavily influenced by the
- outliers while not completely ignoring their effect.
-
- Corresponding estimators are:
-
- - [HuberRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][huberdocs].
-
- See Also
- --------
- atom.models:AutomaticRelevanceDetermination
- atom.models:LeastAngleRegression
- atom.models:OrdinaryLeastSquares
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import fetch_california_housing
-
- X, y = fetch_california_housing(return_X_y=True)
-
- atom = ATOMRegressor(X, y, random_state=1)
- atom.run(models="Huber", metric="r2", verbose=2)
- ```
-
- """
-
- acronym = "Huber"
- needs_scaling = True
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "linear_model"
- _estimators = CustomDict({"reg": "HuberRegressor"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- epsilon=Float(1, 10, log=True),
- max_iter=Int(50, 500, step=10),
- alpha=Float(1e-4, 1, log=True),
- )
-
-
-class HistGradientBoosting(ClassRegModel):
- """Histogram-based Gradient Boosting Machine.
-
- This Histogram-based Gradient Boosting Machine is much faster than
- the standard [GradientBoostingMachine][] for big datasets
- (n_samples>=10k). This variation first bins the input samples into
- integer-valued bins which tremendously reduces the number of
- splitting points to consider, and allows the algorithm to leverage
- integer-based data structures (histograms) instead of relying on
- sorted continuous values when building the trees.
-
- Corresponding estimators are:
-
- - [HistGradientBoostingClassifier][] for classification tasks.
- - [HistGradientBoostingRegressor][] for regression tasks.
-
- Read more in sklearn's [documentation][hgbmdocs].
-
- See Also
- --------
- atom.models:CatBoost
- atom.models:GradientBoostingMachine
- atom.models:XGBoost
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="hGBM", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "hGBM"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "ensemble"
- _estimators = CustomDict(
- {
- "class": "HistGradientBoostingClassifier",
- "reg": "HistGradientBoostingRegressor",
- }
- )
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- dist = CustomDict(
- loss=Cat(["squared_error", "absolute_error", "poisson", "quantile", "gamma"]),
- learning_rate=Float(0.01, 1.0, log=True),
- max_iter=Int(10, 500, step=10),
- max_leaf_nodes=Int(10, 50),
- max_depth=Cat([None, *range(1, 17)]),
- min_samples_leaf=Int(10, 30),
- l2_regularization=Float(0, 1.0, step=0.1),
- )
-
- if self.goal == "class":
- dist.pop("loss")
-
- return dist
-
-
-class KNearestNeighbors(ClassRegModel):
- """K-Nearest Neighbors.
-
- K-Nearest Neighbors, as the name clearly indicates, implements the
- k-nearest neighbors vote. For regression, the target is predicted
- by local interpolation of the targets associated of the nearest
- neighbors in the training set.
-
- Corresponding estimators are:
-
- - [KNeighborsClassifier][] for classification tasks.
- - [KNeighborsRegressor][] for classification tasks.
-
- Read more in sklearn's [documentation][knndocs].
-
- See Also
- --------
- atom.models:LinearDiscriminantAnalysis
- atom.models:QuadraticDiscriminantAnalysis
- atom.models:RadiusNearestNeighbors
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="KNN", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "KNN"
- needs_scaling = True
- accepts_sparse = True
- native_multilabel = True
- native_multioutput = True
- has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
-
- _module = "neighbors"
- _estimators = CustomDict(
- {"class": "KNeighborsClassifier", "reg": "KNeighborsRegressor"}
- )
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- dist = CustomDict(
- n_neighbors=Int(1, 100),
- weights=Cat(["uniform", "distance"]),
- algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
- leaf_size=Int(20, 40),
- p=Int(1, 2),
- )
-
- if self._gpu:
- dist.pop("algorithm") # Only 'brute' is supported
- if self.engine["estimator"] == "cuml":
- dist.pop("weights") # Only 'uniform' is supported
- dist.pop("leaf_size")
- dist.pop("p")
-
- return dist
-
-
-class Lasso(ClassRegModel):
- """Linear Regression with lasso regularization.
-
- Linear least squares with l1 regularization.
-
- Corresponding estimators are:
-
- - [Lasso][lassoreg] for regression tasks.
-
- Read more in sklearn's [documentation][lassodocs].
-
- See Also
- --------
- atom.models:ElasticNet
- atom.models:OrdinaryLeastSquares
- atom.models:Ridge
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import fetch_california_housing
-
- X, y = fetch_california_housing(return_X_y=True)
-
- atom = ATOMRegressor(X, y, random_state=1)
- atom.run(models="Lasso", metric="r2", verbose=2)
- ```
-
- """
-
- acronym = "Lasso"
- needs_scaling = True
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn", "sklearnex", "cuml"]
-
- _module = "linear_model"
- _estimators = CustomDict({"reg": "Lasso"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- alpha=Float(1e-3, 10, log=True),
- selection=Cat(["cyclic", "random"]),
- )
-
-
-class LeastAngleRegression(ClassRegModel):
- """Least Angle Regression.
-
- Least-Angle Regression is a regression algorithm for
- high-dimensional data. Lars is similar to forward stepwise
- regression. At each step, it finds the feature most correlated
- with the target. When there are multiple features having equal
- correlation, instead of continuing along the same feature, it
- proceeds in a direction equiangular between the features.
-
- Corresponding estimators are:
-
- - [Lars][] for regression tasks.
-
- Read more in sklearn's [documentation][larsdocs].
-
- See Also
- --------
- atom.models:BayesianRidge
- atom.models:HuberRegression
- atom.models:OrdinaryLeastSquares
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import fetch_california_housing
-
- X, y = fetch_california_housing(return_X_y=True)
-
- atom = ATOMRegressor(X, y, random_state=1)
- atom.run(models="Lars", metric="r2", verbose=2)
- ```
-
- """
-
- acronym = "Lars"
- needs_scaling = True
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "linear_model"
- _estimators = CustomDict({"reg": "Lars"})
-
-
-class LightGBM(ClassRegModel):
- """Light Gradient Boosting Machine.
-
- LightGBM is a gradient boosting model that uses tree based learning
- algorithms. It is designed to be distributed and efficient with the
- following advantages:
-
- - Faster training speed and higher efficiency.
- - Lower memory usage.
- - Better accuracy.
- - Capable of handling large-scale data.
-
- Corresponding estimators are:
-
- - [LGBMClassifier][] for classification tasks.
- - [LGBMRegressor][] for regression tasks.
-
- Read more in LightGBM's [documentation][lgbdocs].
-
- !!! info
- Using LightGBM's [GPU acceleration][estimator-acceleration]
- requires [additional software dependencies][lgb_gpu].
-
- See Also
- --------
- atom.models:CatBoost
- atom.models:GradientBoostingMachine
- atom.models:XGBoost
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="LGB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "LGB"
- needs_scaling = True
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = "n_estimators"
- supports_engines = ["lightgbm"]
-
- _module = "lightgbm.sklearn"
- _estimators = CustomDict({"class": "LGBMClassifier", "reg": "LGBMRegressor"})
-
- def _get_est(self, **params) -> PREDICTOR:
- """Get the model's estimator with unpacked parameters.
-
- Returns
- -------
- Predictor
- Estimator instance.
-
- """
- # Custom lightgbm mapping for warnings
- # PYTHONWARNINGS doesn't work since they go from C/C++ code to stdout
- warns = dict(always=2, default=1, error=0, ignore=-1)
-
- return self._est_class(
- verbose=params.pop("verbose", warns.get(self.warnings, -1)),
- n_jobs=params.pop("n_jobs", self.n_jobs),
- device=params.pop("device", "gpu" if self._gpu else "cpu"),
- gpu_device_id=params.pop("gpu_device_id", self._device_id or -1),
- random_state=params.pop("random_state", self.random_state),
- **params,
- )
-
- def _fit_estimator(
- self,
- estimator: PREDICTOR,
- data: tuple[DATAFRAME, SERIES],
- est_params_fit: dict,
- validation: tuple[DATAFRAME, SERIES] | None = None,
- trial: Trial | None = None,
- ):
- """Fit the estimator and perform in-training validation.
-
- Parameters
- ----------
- estimator: Predictor
- Instance to fit.
-
- data: tuple
- Training data of the form (X, y).
-
- est_params_fit: dict
- Additional parameters for the estimator's fit method.
-
- validation: tuple or None
- Validation data of the form (X, y). If None, no validation
- is performed.
-
- trial: [Trial][] or None
- Active trial (during hyperparameter tuning).
-
- Returns
- -------
- Predictor
- Fitted instance.
-
- """
- from lightgbm.callback import log_evaluation
-
- m = self._metric[0].name
- params = est_params_fit.copy()
-
- callbacks = params.pop("callbacks", []) + [log_evaluation(-1)]
- if trial and len(self._metric) == 1:
- callbacks.append(LightGBMPruningCallback(trial, m, "valid_1"))
-
- eval_metric = None
- if getattr(self, "_metric", None):
- eval_metric = LGBMetric(self._metric[0], task=self.task)
-
- try:
- estimator.fit(
- *data,
- eval_set=[data, validation] if validation else None,
- eval_metric=params.pop("eval_metric", eval_metric),
- callbacks=callbacks,
- **params,
- )
- except TrialPruned as ex:
- # Add the pruned step to the output
- step = str(ex).split(" ")[-1][:-1]
- steps = estimator.get_params()[self.has_validation]
- trial.params[self.has_validation] = f"{step}/{steps}"
-
- trial.set_user_attr("estimator", estimator)
- raise ex
-
- if validation:
- # Create evals attribute with train and validation scores
- self._evals[f"{m}_train"] = estimator.evals_result_["training"][m]
- self._evals[f"{m}_test"] = estimator.evals_result_["valid_1"][m]
-
- return estimator
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- n_estimators=Int(20, 500, step=10),
- learning_rate=Float(0.01, 1.0, log=True),
- max_depth=Int(-1, 17, step=2),
- num_leaves=Int(20, 40),
- min_child_weight=Float(1e-4, 100, log=True),
- min_child_samples=Int(1, 30),
- subsample=Float(0.5, 1.0, step=0.1),
- colsample_bytree=Float(0.4, 1.0, step=0.1),
- reg_alpha=Float(1e-4, 100, log=True),
- reg_lambda=Float(1e-4, 100, log=True),
- )
-
-
-class LinearDiscriminantAnalysis(ClassRegModel):
- """Linear Discriminant Analysis.
-
- Linear Discriminant Analysis is a classifier with a linear
- decision boundary, generated by fitting class conditional densities
- to the data and using Bayes’ rule. The model fits a Gaussian
- density to each class, assuming that all classes share the same
- covariance matrix.
-
- Corresponding estimators are:
-
- - [LinearDiscriminantAnalysis][ldaclassifier] for classification tasks.
-
- Read more in sklearn's [documentation][ldadocs].
-
- See Also
- --------
- atom.models:LogisticRegression
- atom.models:RadiusNearestNeighbors
- atom.models:QuadraticDiscriminantAnalysis
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="LDA", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "LDA"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = False
- has_validation = None
- supports_engines = ["sklearn"]
-
- _module = "discriminant_analysis"
- _estimators = CustomDict({"class": "LinearDiscriminantAnalysis"})
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if self._get_param("solver", params) == "svd":
- params.pop("shrinkage")
-
- return params
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- solver=Cat(["svd", "lsqr", "eigen"]),
- shrinkage=Cat([None, "auto", 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
- )
-
-
class LinearSVM(ClassRegModel):
    """Linear Support Vector Machine.

    Similar to [SupportVectorMachine][] but with a linear kernel.
    Implemented in terms of liblinear rather than libsvm, so it has
    more flexibility in the choice of penalties and loss functions and
    should scale better to large numbers of samples.

    Corresponding estimators are:

    - [LinearSVC][] for classification tasks.
    - [LinearSVR][] for regression tasks.

    Read more in sklearn's [documentation][svmdocs].

    See Also
    --------
    atom.models:KNearestNeighbors
    atom.models:StochasticGradientDescent
    atom.models:SupportVectorMachine

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="lSVM", metric="f1", verbose=2)
    ```

    """

    acronym = "lSVM"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "svm"
    _estimators = CustomDict({"class": "LinearSVC", "reg": "LinearSVR"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Drops or overwrites sampled values that form invalid
        penalty/loss/dual combinations for liblinear.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        if self.goal == "class":
            if self._get_param("loss", params) == "hinge":
                # l1 regularization can't be combined with hinge
                params.replace_value("penalty", "l2")
                # l2 regularization can't be combined with hinge when dual=False
                params.replace_value("dual", True)
            elif self._get_param("loss", params) == "squared_hinge":
                # l1 regularization can't be combined with squared_hinge when dual=True
                if self._get_param("penalty", params) == "l1":
                    params.replace_value("dual", False)
        elif self._get_param("loss", params) == "epsilon_insensitive":
            # LinearSVR only supports dual=True with this loss
            params.replace_value("dual", True)

        return params

    def _get_est(self, **params) -> PREDICTOR:
        """Get the estimator instance.

        Parameters
        ----------
        **params
            Unpacked hyperparameters for the estimator.

        Returns
        -------
        Predictor
            Estimator instance.

        """
        if self.engine["estimator"] == "cuml" and self.goal == "class":
            # cuML's LinearSVC requires probability=True to enable predict_proba
            return self._est_class(probability=params.pop("probability", True), **params)
        else:
            return super()._get_est(**params)

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        if self.goal == "class":
            dist["penalty"] = Cat(["l1", "l2"])
            dist["loss"] = Cat(["hinge", "squared_hinge"])
        else:
            dist["loss"] = Cat(["epsilon_insensitive", "squared_epsilon_insensitive"])

        dist["C"] = Float(1e-3, 100, log=True)
        dist["dual"] = Cat([True, False])

        if self.engine["estimator"] == "cuml":
            # cuML's implementation has no dual parameter
            dist.pop("dual")

        return dist
-
-
class LogisticRegression(ClassRegModel):
    """Logistic Regression.

    Logistic regression, despite its name, is a linear model for
    classification rather than regression. Logistic regression is also
    known in the literature as logit regression, maximum-entropy
    classification (MaxEnt) or the log-linear classifier. In this model,
    the probabilities describing the possible outcomes of a single trial
    are modeled using a logistic function.

    Corresponding estimators are:

    - [LogisticRegression][] for classification tasks.

    Read more in sklearn's [documentation][lrdocs].

    See Also
    --------
    atom.models:GaussianProcess
    atom.models:LinearDiscriminantAnalysis
    atom.models:PassiveAggressive

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="LR", metric="f1", verbose=2)
    ```

    """

    acronym = "LR"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "linear_model"
    _estimators = CustomDict({"class": "LogisticRegression"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Replaces invalid penalty + solver combinations with the default
        penalty and drops parameters that are unused by the sampled
        penalty.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # Limitations on penalty + solver combinations
        penalty = self._get_param("penalty", params)
        solver = self._get_param("solver", params)
        cond_1 = penalty is None and solver == "liblinear"
        cond_2 = penalty == "l1" and solver not in ("liblinear", "saga")
        cond_3 = penalty == "elasticnet" and solver != "saga"

        if cond_1 or cond_2 or cond_3:
            params.replace_value("penalty", "l2")  # Change to default value

        # l1_ratio is only used by the elasticnet penalty
        if self._get_param("penalty", params) != "elasticnet":
            params.pop("l1_ratio")

        # C is meaningless without regularization
        if self._get_param("penalty", params) is None:
            params.pop("C")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            penalty=Cat([None, "l1", "l2", "elasticnet"]),
            C=Float(1e-3, 100, log=True),
            solver=Cat(["lbfgs", "newton-cg", "liblinear", "sag", "saga"]),
            max_iter=Int(100, 1000, step=10),
            l1_ratio=Float(0, 1.0, step=0.1),
        )

        if self._gpu:
            dist.pop("solver")
            dist.pop("penalty")  # Only 'l2' is supported
        elif self.engine["estimator"] == "sklearnex":
            dist["solver"] = Cat(["lbfgs", "newton-cg"])

        return dist
-
-
class MultiLayerPerceptron(ClassRegModel):
    """Multi-layer Perceptron.

    A multi-layer perceptron is a supervised learning algorithm that
    learns a function by training on a dataset. Given a set of features
    and a target, it can fit a non-linear function approximator for
    either classification or regression. It differs from logistic
    regression in that one or more non-linear hidden layers sit between
    the input and the output layer.

    Corresponding estimators are:

    - [MLPClassifier][] for classification tasks.
    - [MLPRegressor][] for regression tasks.

    Read more in sklearn's [documentation][mlpdocs].

    See Also
    --------
    atom.models:PassiveAggressive
    atom.models:Perceptron
    atom.models:StochasticGradientDescent

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="MLP", metric="f1", verbose=2)
    ```

    """

    acronym = "MLP"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = False
    has_validation = "max_iter"
    supports_engines = ["sklearn"]

    _module = "neural_network"
    _estimators = CustomDict({"class": "MLPClassifier", "reg": "MLPRegressor"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # As soon as one hidden layer has zero neurons, that layer and
        # every deeper layer are dropped from the parameters
        truncate = False
        for name in [p for p in sorted(params) if p.startswith("hidden_layer")]:
            if truncate or params[name] == 0:
                truncate = True
                params.pop(name)

        if self._get_param("solver", params) == "sgd":
            # learning_rate_init is controlled through learning_rate
            params.pop("learning_rate_init")
        else:
            # These schedules only apply to the sgd solver
            params.pop("learning_rate")
            params.pop("power_t")

        return params

    def _trial_to_est(self, params: CustomDict) -> CustomDict:
        """Convert trial's hyperparameters to parameters for the estimator.

        Folds the individual hidden_layer_N parameters into the single
        hidden_layer_sizes tuple expected by the estimator.

        Parameters
        ----------
        params: CustomDict
            Trial's hyperparameters.

        Returns
        -------
        CustomDict
            Estimator's hyperparameters.

        """
        params = super()._trial_to_est(params)

        layer_names = [p for p in sorted(params) if p.startswith("hidden_layer")]
        layers = [params.pop(name) for name in layer_names]

        if layers:
            params.insert(0, "hidden_layer_sizes", tuple(layers))

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            hidden_layer_1=Int(10, 100),
            hidden_layer_2=Int(0, 100),
            hidden_layer_3=Int(0, 10),
            activation=Cat(["identity", "logistic", "tanh", "relu"]),
            solver=Cat(["lbfgs", "sgd", "adam"]),
            alpha=Float(1e-4, 0.1, log=True),
            batch_size=Cat(["auto", 8, 16, 32, 64, 128, 256]),
            learning_rate=Cat(["constant", "invscaling", "adaptive"]),
            learning_rate_init=Float(1e-3, 0.1, log=True),
            power_t=Float(0.1, 0.9, step=0.1),
            max_iter=Int(50, 500, step=10),
        )

        # When the user fixed the layer sizes, skip the three layer distributions
        if "hidden_layer_sizes" in self._est_params:
            return dist[3:]

        return dist
-
-
class MultinomialNB(ClassRegModel):
    """Multinomial Naive Bayes.

    MultinomialNB implements the Naive Bayes algorithm for multinomially
    distributed data. It's one of the two classic Naive Bayes variants
    used in text classification, where the data are typically
    represented as word vector counts (tf-idf vectors are also known to
    work well in practice).

    Corresponding estimators are:

    - [MultinomialNB][multinomialnbclass] for classification tasks.

    Read more in sklearn's [documentation][mnbdocs].

    See Also
    --------
    atom.models:BernoulliNB
    atom.models:ComplementNB
    atom.models:GaussianNB

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="MNB", metric="f1", verbose=2)
    ```

    """

    acronym = "MNB"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "naive_bayes"
    _estimators = CustomDict({"class": "MultinomialNB"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            alpha=Float(0.01, 10, log=True),
            fit_prior=Cat([True, False]),
        )
        return dist
-
-
class OrdinaryLeastSquares(ClassRegModel):
    """Linear Regression.

    Ordinary Least Squares is plain linear regression without any
    regularization. It fits a linear model with coefficients
    `w=(w1, ..., wp)` that minimizes the residual sum of squares
    between the observed targets in the dataset and the targets
    predicted by the linear approximation.

    Corresponding estimators are:

    - [LinearRegression][] for regression tasks.

    Read more in sklearn's [documentation][olsdocs].

    See Also
    --------
    atom.models:ElasticNet
    atom.models:Lasso
    atom.models:Ridge

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="OLS", metric="r2", verbose=2)
    ```

    """

    acronym = "OLS"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "LinearRegression"})
-
-
class OrthogonalMatchingPursuit(ClassRegModel):
    """Orthogonal Matching Pursuit.

    Orthogonal Matching Pursuit implements the OMP algorithm, which
    approximates the fit of a linear model under a constraint on the
    number of non-zero coefficients.

    Corresponding estimators are:

    - [OrthogonalMatchingPursuit][] for regression tasks.

    Read more in sklearn's [documentation][ompdocs].

    See Also
    --------
    atom.models:Lasso
    atom.models:LeastAngleRegression
    atom.models:OrdinaryLeastSquares

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="OMP", metric="r2", verbose=2)
    ```

    """

    acronym = "OMP"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "OrthogonalMatchingPursuit"})
-
-
class PassiveAggressive(ClassRegModel):
    """Passive Aggressive.

    The passive-aggressive algorithms are a family of algorithms for
    large-scale learning. They are similar to the Perceptron in that
    they do not require a learning rate. However, contrary to the
    [Perceptron][], they include a regularization parameter `C`.

    Corresponding estimators are:

    - [PassiveAggressiveClassifier][] for classification tasks.
    - [PassiveAggressiveRegressor][] for regression tasks.

    Read more in sklearn's [documentation][padocs].

    See Also
    --------
    atom.models:MultiLayerPerceptron
    atom.models:Perceptron
    atom.models:StochasticGradientDescent

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="PA", metric="f1", verbose=2)
    ```

    """

    acronym = "PA"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = "max_iter"
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict(
        {"class": "PassiveAggressiveClassifier", "reg": "PassiveAggressiveRegressor"}
    )

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        The loss options depend on the task: hinge-based losses for
        classification, epsilon-based losses for regression.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        if self.goal == "class":
            loss = ["hinge", "squared_hinge"]
        else:
            loss = ["epsilon_insensitive", "squared_epsilon_insensitive"]

        return CustomDict(
            C=Float(1e-3, 100, log=True),
            max_iter=Int(500, 1500, step=50),
            loss=Cat(loss),
            average=Cat([True, False]),
        )
-
-
class Perceptron(ClassRegModel):
    """Linear Perceptron classification.

    The Perceptron is a simple classification algorithm suited to
    large scale learning. By default:

    * It does not require a learning rate.
    * It is not regularized (penalized).
    * It updates its model only on mistakes.

    Because of the last point, the Perceptron is slightly faster to
    train than [StochasticGradientDescent][] with the hinge loss, and
    the resulting models are sparser.

    Corresponding estimators are:

    - [Perceptron][percclassifier] for classification tasks.

    Read more in sklearn's [documentation][percdocs].

    See Also
    --------
    atom.models:MultiLayerPerceptron
    atom.models:PassiveAggressive
    atom.models:StochasticGradientDescent

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="Perc", metric="f1", verbose=2)
    ```

    """

    acronym = "Perc"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = "max_iter"
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"class": "Perceptron"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # l1_ratio is only consumed by the elasticnet penalty
        if self._get_param("penalty", params) == "elasticnet":
            return params

        params.pop("l1_ratio")
        return params

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            penalty=Cat([None, "l2", "l1", "elasticnet"]),
            alpha=Float(1e-4, 10, log=True),
            l1_ratio=Float(0.1, 0.9, step=0.1),
            max_iter=Int(500, 1500, step=50),
            eta0=Float(1e-2, 10, log=True),
        )
        return dist
-
-
class QuadraticDiscriminantAnalysis(ClassRegModel):
    """Quadratic Discriminant Analysis.

    Quadratic Discriminant Analysis is a classifier with a quadratic
    decision boundary, generated by fitting class conditional densities
    to the data and using Bayes’ rule. The model fits a Gaussian
    density to each class, where every class fits its own covariance
    matrix (unlike [LinearDiscriminantAnalysis][], which assumes a
    covariance matrix shared by all classes).

    Corresponding estimators are:

    - [QuadraticDiscriminantAnalysis][qdaclassifier] for classification tasks.

    Read more in sklearn's [documentation][qdadocs].

    See Also
    --------
    atom.models:LinearDiscriminantAnalysis
    atom.models:LogisticRegression
    atom.models:RadiusNearestNeighbors

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="QDA", metric="f1", verbose=2)
    ```

    """

    acronym = "QDA"
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "discriminant_analysis"
    _estimators = CustomDict({"class": "QuadraticDiscriminantAnalysis"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        return CustomDict(reg_param=Float(0, 1.0, step=0.1))
-
-
class RadiusNearestNeighbors(ClassRegModel):
    """Radius Nearest Neighbors.

    Radius Nearest Neighbors implements the nearest neighbors vote,
    where the neighbors are selected from within a given radius. For
    regression, the target is predicted by local interpolation of the
    targets associated with the nearest neighbors in the training set.

    !!! warning
        * The `radius` parameter should be tuned to the data at hand or
          the model will perform poorly.
        * If outliers are detected, the estimator raises an exception
          unless `est_params={"outlier_label": "most_frequent"}` is used.

    Corresponding estimators are:

    - [RadiusNeighborsClassifier][] for classification tasks.
    - [RadiusNeighborsRegressor][] for regression tasks.

    Read more in sklearn's [documentation][knndocs].

    See Also
    --------
    atom.models:KNearestNeighbors
    atom.models:LinearDiscriminantAnalysis
    atom.models:QuadraticDiscriminantAnalysis

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(
        models="RNN",
        metric="f1",
        est_params={"outlier_label": "most_frequent"},
        verbose=2,
    )
    ```

    """

    acronym = "RNN"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "neighbors"
    _estimators = CustomDict(
        {"class": "RadiusNeighborsClassifier", "reg": "RadiusNeighborsRegressor"}
    )

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            radius=Float(1e-2, 100),
            weights=Cat(["uniform", "distance"]),
            algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
            leaf_size=Int(20, 40),
            p=Int(1, 2),
        )
        return dist
-
-
class RandomForest(ClassRegModel):
    """Random Forest.

    Random forests are an ensemble learning method that operate by
    constructing a multitude of decision trees at training time and
    outputting the class that is the mode of the classes
    (classification) or mean prediction (regression) of the individual
    trees. Random forests correct for decision trees' habit of
    overfitting to their training set.

    Corresponding estimators are:

    - [RandomForestClassifier][] for classification tasks.
    - [RandomForestRegressor][] for regression tasks.

    Read more in sklearn's [documentation][rfdocs].

    !!! warning
        cuML's implementation of [RandomForestClassifier][cumlrf] only
        supports predictions on dtype `float32`. Convert all dtypes
        before calling atom's [run][atomclassifier-run] method to avoid
        exceptions.

    See Also
    --------
    atom.models:DecisionTree
    atom.models:ExtraTrees
    atom.models:HistGradientBoosting

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="RF", metric="f1", verbose=2)
    ```

    """

    acronym = "RF"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "ensemble"
    _estimators = CustomDict(
        {"class": "RandomForestClassifier", "reg": "RandomForestRegressor"}
    )

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # max_samples is only used when bootstrapping
        if not self._get_param("bootstrap", params):
            params.pop("max_samples")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        The criterion options depend on the task and engine, and some
        parameters are unsupported by the sklearnex/cuml engines.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        if self.goal == "class":
            criterion = ["gini", "entropy"]
        else:
            if self.engine["estimator"] == "cuml":
                criterion = ["mse", "poisson", "gamma", "inverse_gaussian"]
            else:
                criterion = ["squared_error", "absolute_error", "poisson"]

        dist = CustomDict(
            n_estimators=Int(10, 500, step=10),
            criterion=Cat(criterion),
            max_depth=Cat([None, *range(1, 17)]),
            min_samples_split=Int(2, 20),
            min_samples_leaf=Int(1, 20),
            max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
            bootstrap=Cat([True, False]),
            max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
            ccp_alpha=Float(0, 0.035, step=0.005),
        )

        if self.engine["estimator"] == "sklearnex":
            dist.pop("criterion")
            dist.pop("ccp_alpha")
        elif self.engine["estimator"] == "cuml":
            # cuML uses a different parameter name and disallows None values
            dist.replace_key("criterion", "split_criterion")
            dist["max_depth"] = Int(1, 17)
            dist["max_features"] = Cat(["sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9])
            dist["max_samples"] = Float(0.5, 0.9, step=0.1)
            dist.pop("ccp_alpha")

        return dist
-
-
class Ridge(ClassRegModel):
    """Linear least squares with l2 regularization.

    When used as a classifier, the target values are first converted
    to {-1, 1}, after which the problem is treated as a regression
    task.

    Corresponding estimators are:

    - [RidgeClassifier][] for classification tasks.
    - [Ridge][ridgeregressor] for regression tasks.

    Read more in sklearn's [documentation][ridgedocs].

    !!! warning
        Engines `sklearnex` and `cuml` are only available for regression
        tasks.

    See Also
    --------
    atom.models:BayesianRidge
    atom.models:ElasticNet
    atom.models:Lasso

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="Ridge", metric="r2", verbose=2)
    ```

    """

    acronym = "Ridge"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "linear_model"
    _estimators = CustomDict({"class": "RidgeClassifier", "reg": "Ridge"})

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            alpha=Float(1e-3, 10, log=True),
            solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
        )

        if self.goal == "reg":
            engine = self.engine["estimator"]
            if engine == "sklearnex":
                dist.pop("solver")  # Only supports 'auto'
            elif engine == "cuml":
                dist["solver"] = Cat(["eig", "svd", "cd"])

        return dist
-
-
class StochasticGradientDescent(ClassRegModel):
    """Stochastic Gradient Descent.

    Stochastic Gradient Descent is a simple yet very efficient approach
    to fitting linear classifiers and regressors under convex loss
    functions. Although SGD has been around in the machine learning
    community for a long time, it has recently received considerable
    attention in the context of large-scale learning.

    Corresponding estimators are:

    - [SGDClassifier][] for classification tasks.
    - [SGDRegressor][] for regression tasks.

    Read more in sklearn's [documentation][sgddocs].

    See Also
    --------
    atom.models:MultiLayerPerceptron
    atom.models:PassiveAggressive
    atom.models:SupportVectorMachine

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="SGD", metric="f1", verbose=2)
    ```

    """

    acronym = "SGD"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = "max_iter"
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"class": "SGDClassifier", "reg": "SGDRegressor"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # l1_ratio only applies to the elasticnet penalty
        penalty = self._get_param("penalty", params)
        if penalty != "elasticnet":
            params.pop("l1_ratio")

        # eta0 is ignored by the 'optimal' learning rate schedule
        schedule = self._get_param("learning_rate", params)
        if schedule == "optimal":
            params.pop("eta0")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        # Classification may use any loss; regression only the last four
        loss = [
            "hinge",
            "log_loss",
            "modified_huber",
            "squared_hinge",
            "perceptron",
            "squared_error",
            "huber",
            "epsilon_insensitive",
            "squared_epsilon_insensitive",
        ]
        if self.goal != "class":
            loss = loss[-4:]

        return CustomDict(
            loss=Cat(loss),
            penalty=Cat([None, "l1", "l2", "elasticnet"]),
            alpha=Float(1e-4, 1.0, log=True),
            l1_ratio=Float(0.1, 0.9, step=0.1),
            max_iter=Int(500, 1500, step=50),
            epsilon=Float(1e-4, 1.0, log=True),
            learning_rate=Cat(["constant", "invscaling", "optimal", "adaptive"]),
            eta0=Float(1e-2, 10, log=True),
            power_t=Float(0.1, 0.9, step=0.1),
            average=Cat([True, False]),
        )
-
-
class SupportVectorMachine(ClassRegModel):
    """Support Vector Machine.

    The implementation of the Support Vector Machine is based on libsvm.
    The fit time scales at least quadratically with the number of
    samples and may be impractical beyond tens of thousands of samples.
    For large datasets consider using a [LinearSVM][] or a
    [StochasticGradientDescent][] model instead.

    Corresponding estimators are:

    - [SVC][] for classification tasks.
    - [SVR][] for regression tasks.

    Read more in sklearn's [documentation][svmdocs].

    See Also
    --------
    atom.models:LinearSVM
    atom.models:MultiLayerPerceptron
    atom.models:StochasticGradientDescent

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="SVM", metric="f1", verbose=2)
    ```

    """

    acronym = "SVM"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "svm"
    _estimators = CustomDict({"class": "SVC", "reg": "SVR"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Drops parameters that are unused by the sampled kernel and
        avoids known-invalid combinations.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        if self.goal == "class":
            # epsilon only exists for SVR
            params.pop("epsilon")

        kernel = self._get_param("kernel", params)
        if kernel == "poly":
            params.replace_value("gamma", "scale")  # Crashes in combination with "auto"
        else:
            # degree is only used by the poly kernel
            params.pop("degree")

        if kernel not in ("rbf", "poly", "sigmoid"):
            params.pop("gamma")

        if kernel not in ("poly", "sigmoid"):
            params.pop("coef0")

        return params

    def _get_est(self, **params) -> PREDICTOR:
        """Get the model's estimator with unpacked parameters.

        Parameters
        ----------
        **params
            Unpacked hyperparameters for the estimator.

        Returns
        -------
        Predictor
            Estimator instance.

        """
        if self.engine["estimator"] == "cuml" and self.goal == "class":
            # cuML's SVC needs probability=True for predict_proba support
            return self._est_class(
                probability=params.pop("probability", True),
                random_state=params.pop("random_state", self.random_state),
                **params)
        else:
            return super()._get_est(**params)

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            C=Float(1e-3, 100, log=True),
            kernel=Cat(["linear", "poly", "rbf", "sigmoid"]),
            degree=Int(2, 5),
            gamma=Cat(["scale", "auto"]),
            coef0=Float(-1.0, 1.0),
            epsilon=Float(1e-3, 100, log=True),
            shrinking=Cat([True, False]),
        )

        if self.engine["estimator"] == "cuml":
            # Not supported by cuML's implementation
            dist.pop("epsilon")
            dist.pop("shrinking")

        return dist
-
-
-class XGBoost(ClassRegModel):
- """Extreme Gradient Boosting.
-
- XGBoost is an optimized distributed gradient boosting model
- designed to be highly efficient, flexible and portable. XGBoost
- provides a parallel tree boosting that solve many data science
- problems in a fast and accurate way.
-
- Corresponding estimators are:
-
- - [XGBClassifier][] for classification tasks.
- - [XGBRegressor][] for regression tasks.
-
- Read more in XGBoost's [documentation][xgbdocs].
-
- See Also
- --------
- atom.models:CatBoost
- atom.models:GradientBoostingMachine
- atom.models:LightGBM
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(models="XGB", metric="f1", verbose=2)
- ```
-
- """
-
- acronym = "XGB"
- needs_scaling = True
- accepts_sparse = True
- native_multilabel = False
- native_multioutput = False
- has_validation = "n_estimators"
- supports_engines = ["xgboost"]
-
- _module = "xgboost"
- _estimators = CustomDict({"class": "XGBClassifier", "reg": "XGBRegressor"})
-
- def _get_est(self, **params) -> PREDICTOR:
- """Get the model's estimator with unpacked parameters.
-
- Returns
- -------
- Predictor
- Estimator instance.
-
- """
- eval_metric = None
- if getattr(self, "_metric", None):
- eval_metric = XGBMetric(self._metric[0], task=self.task)
-
- return self._est_class(
- eval_metric=params.pop("eval_metric", eval_metric),
- n_jobs=params.pop("n_jobs", self.n_jobs),
- tree_method=params.pop("tree_method", "gpu_hist" if self._gpu else None),
- gpu_id=self._device_id,
- verbosity=params.pop("verbosity", 0),
- random_state=params.pop("random_state", self.random_state),
- **params,
- )
-
- def _fit_estimator(
- self,
- estimator: PREDICTOR,
- data: tuple[DATAFRAME, SERIES],
- est_params_fit: dict,
- validation: tuple[DATAFRAME, SERIES] | None = None,
- trial: Trial | None = None,
- ):
- """Fit the estimator and perform in-training validation.
-
- Parameters
- ----------
- estimator: Predictor
- Instance to fit.
-
- data: tuple
- Training data of the form (X, y).
-
- est_params_fit: dict
- Additional parameters for the estimator's fit method.
-
- validation: tuple or None
- Validation data of the form (X, y). If None, no validation
- is performed.
-
- trial: [Trial][] or None
- Active trial (during hyperparameter tuning).
-
- Returns
- -------
- Predictor
- Fitted instance.
-
- """
- m = self._metric[0].name
- params = est_params_fit.copy()
-
- callbacks = params.pop("callbacks", [])
- if trial and len(self._metric) == 1:
- callbacks.append(XGBoostPruningCallback(trial, f"validation_1-{m}"))
-
- try:
- estimator.set_params(callbacks=callbacks)
- estimator.fit(
- *data,
- eval_set=[data, validation] if validation else None,
- verbose=params.get("verbose", False),
- **params,
- )
- except TrialPruned as ex:
- # Add the pruned step to the output
- step = str(ex).split(" ")[-1][:-1]
- steps = estimator.get_params()[self.has_validation]
- trial.params[self.has_validation] = f"{step}/{steps}"
-
- trial.set_user_attr("estimator", estimator)
- raise ex
-
- if validation:
- # Create evals attribute with train and validation scores
- # Negative because minimizes the function
- results = estimator.evals_result()
- self._evals[f"{m}_train"] = np.negative(results["validation_0"][m])
- self._evals[f"{m}_test"] = np.negative(results["validation_1"][m])
-
- return estimator
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- n_estimators=Int(20, 500, step=10),
- learning_rate=Float(0.01, 1.0, log=True),
- max_depth=Int(1, 20),
- gamma=Float(0, 1.0),
- min_child_weight=Int(1, 10),
- subsample=Float(0.5, 1.0, step=0.1),
- colsample_bytree=Float(0.4, 1.0, step=0.1),
- reg_alpha=Float(1e-4, 100, log=True),
- reg_lambda=Float(1e-4, 100, log=True),
- )
-
-
-# Time series ====================================================== >>
-
-class ARIMA(ForecastModel):
- """Autoregressive Integrated Moving Average Model.
-
- Seasonal ARIMA models and exogeneous input is supported, hence this
- estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX.
-
- An ARIMA model, is a generalization of an autoregressive moving
- average (ARMA) model, and is fitted to time-series data in an effort
- to forecast future points. ARIMA models can be especially
- efficacious in cases where data shows evidence of non-stationarity.
-
- The "AR" part of ARIMA indicates that the evolving variable of
- interest is regressed on its own lagged (i.e., prior observed)
- values. The "MA" part indicates that the regression error is
- actually a linear combination of error terms whose values occurred
- contemporaneously and at various times in the past. The "I" (for
- "integrated") indicates that the data values have been replaced with
- the difference between their values and the previous values (and this
- differencing process may have been performed more than once).
-
- Corresponding estimators are:
-
- - [ARIMA][arimaclass] for forecasting tasks.
-
- !!! warning
- ARIMA often runs into numerical errors when optimizing the
- hyperparameters. Possible solutions are:
-
- - Use the [AutoARIMA][] model instead.
- - Use [`est_params`][directforecaster-est_params] to specify the
- orders manually, e.g. `#!python atom.run("arima", n_trials=5,
- est_params={"order": (1, 1, 0)})`.
- - Use the `catch` parameter in [`ht_params`][directforecaster-ht_params]
- to avoid raising every exception, e.g. `#!python atom.run("arima",
- n_trials=5, ht_params={"catch": (Exception,)})`.
-
- See Also
- --------
- atom.models:AutoARIMA
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_longley
-
- _, X = load_longley()
-
- atom = ATOMForecaster(X)
- atom.run(models="ARIMA", verbose=2)
- ```
-
- """
-
- acronym = "ARIMA"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = True
- has_validation = None
- supports_engines = ["sktime"]
-
- _module = "sktime.forecasting.arima"
- _estimators = CustomDict({"fc": "ARIMA"})
-
- _order = ("p", "d", "q")
- _sorder = ("Ps", "Ds", "Qs", "S")
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- # If no seasonal periodicity, set seasonal components to zero
- if self._get_param("S", params) == 0:
- for p in self._sorder:
- params.replace_value(p, 0)
-
- return params
-
- def _trial_to_est(self, params: CustomDict) -> CustomDict:
- """Convert trial's hyperparameters to parameters for the estimator.
-
- Parameters
- ----------
- params: CustomDict
- Trial's hyperparameters.
-
- Returns
- -------
- CustomDict
- Estimator's hyperparameters.
-
- """
- params = super()._trial_to_est(params)
-
- # Convert params to hyperparameters order and seasonal_order
- if all(p in params for p in self._sorder):
- params.insert(0, "seasonal_order", tuple(params.pop(p) for p in self._sorder))
- if all(p in params for p in self._order):
- params.insert(0, "order", tuple(params.pop(p) for p in self._order))
-
- return params
-
- def _get_distributions(self) -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"]
-
- dist = CustomDict(
- p=Int(0, 2),
- d=Int(0, 1),
- q=Int(0, 2),
- Ps=Int(0, 2),
- Ds=Int(0, 1),
- Qs=Int(0, 2),
- S=Cat([0, 4, 6, 7, 12]),
- method=Cat(methods),
- maxiter=Int(50, 200, step=10),
- with_intercept=Cat([True, False]),
- )
-
- # Drop order and seasonal_order params if specified by user
- if "order" in self._est_params:
- for p in self._order:
- dist.pop(p)
- if "seasonal_order" in self._est_params:
- for p in self._sorder:
- dist.pop(p)
-
- return dist
-
-
-class AutoARIMA(ForecastModel):
- """Automatic Autoregressive Integrated Moving Average Model.
-
- [ARIMA][] implementation that includes automated fitting of
- (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA
- algorithm seeks to identify the most optimal parameters for an
- ARIMA model, settling on a single fitted ARIMA model. This process
- is based on the commonly-used R function.
-
- AutoARIMA works by conducting differencing tests (i.e.,
- Kwiatkowski–Phillips–Schmidt–Shin, Augmented Dickey-Fuller or
- Phillips–Perron) to determine the order of differencing, d, and
- then fitting models within defined ranges. AutoARIMA also seeks
- to identify the optimal P and Q hyperparameters after conducting
- the Canova-Hansen to determine the optimal order of seasonal
- differencing.
-
- Note that due to stationarity issues, AutoARIMA might not find a
- suitable model that will converge. If this is the case, a ValueError
- is thrown suggesting stationarity-inducing measures be taken prior
- to re-fitting or that a new range of order values be selected.
-
- Corresponding estimators are:
-
- - [AutoARIMA][autoarimaclass] for forecasting tasks.
-
- See Also
- --------
- atom.models:ARIMA
- atom.models:ETS
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_longley
-
- _, X = load_longley()
-
- atom = ATOMForecaster(X, random_state=1)
- atom.run(models="autoarima", verbose=2)
- ```
-
- """
-
- acronym = "AutoARIMA"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = True
- has_validation = None
- supports_engines = ["sktime"]
-
- _module = "sktime.forecasting.arima"
- _estimators = CustomDict({"fc": "AutoARIMA"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"]
-
- return CustomDict(
- method=Cat(methods),
- maxiter=Int(50, 200, step=10),
- with_intercept=Cat([True, False]),
- )
-
-
-class ExponentialSmoothing(ForecastModel):
- """Exponential Smoothing forecaster.
-
- Holt-Winters exponential smoothing forecaster. The default settings
- use simple exponential smoothing, without trend and seasonality
- components.
-
- Corresponding estimators are:
-
- - [ExponentialSmoothing][esclass] for forecasting tasks.
-
- See Also
- --------
- atom.models:ARIMA
- atom.models:ETS
- atom.models:PolynomialTrend
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_airline
-
- y = load_airline()
-
- atom = ATOMForecaster(y, random_state=1)
- atom.run(models="ES", verbose=2)
- ```
-
- """
-
- acronym = "ES"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = True
- has_validation = None
- supports_engines = ["sktime"]
-
- _module = "sktime.forecasting.exp_smoothing"
- _estimators = CustomDict({"fc": "ExponentialSmoothing"})
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- if self._get_param("trend", params) is None:
- params.pop("damped_trend")
-
- if self._get_param("sp", params) is None:
- params.pop("seasonal")
-
- return params
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- methods = ["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"]
-
- return CustomDict(
- trend=Cat(["add", "mul", None]),
- damped_trend=Cat([True, False]),
- seasonal=Cat(["add", "mul", None]),
- sp=Cat([4, 6, 7, 12, None]),
- use_boxcox=Cat([True, False]),
- initialization_method=Cat(["estimated", "heuristic"]),
- method=Cat(methods),
- )
-
-
-class ETS(ForecastModel):
- """ETS model with automatic fitting capabilities.
-
- The ETS models are a family of time series models with an
- underlying state space model consisting of a level component,
- a trend component (T), a seasonal component (S), and an error
- term (E).
-
- Corresponding estimators are:
-
- - [AutoETS][] for forecasting tasks.
-
- See Also
- --------
- atom.models:ARIMA
- atom.models:ExponentialSmoothing
- atom.models:PolynomialTrend
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_airline
-
- y = load_airline()
-
- atom = ATOMForecaster(y, random_state=1)
- atom.run(models="ETS", verbose=2)
-
- ```
-
- """
-
- acronym = "ETS"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = True
- has_validation = None
- supports_engines = ["sktime"]
-
- _module = "sktime.forecasting.ets"
- _estimators = CustomDict({"fc": "AutoETS"})
-
- def _get_parameters(self, trial: Trial) -> CustomDict:
- """Get the trial's hyperparameters.
-
- Parameters
- ----------
- trial: [Trial][]
- Current trial.
-
- Returns
- -------
- CustomDict
- Trial's hyperparameters.
-
- """
- params = super()._get_parameters(trial)
-
- # If no seasonal periodicity, set seasonal components to zero
- if self._get_param("sp", params) == 1:
- params.pop("seasonal")
-
- return params
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- error=Cat(["add", "mul"]),
- trend=Cat(["add", "mul", None]),
- damped_trend=Cat([True, False]),
- seasonal=Cat(["add", "mul", None]),
- sp=Cat([1, 4, 6, 7, 12]),
- initialization_method=Cat(["estimated", "heuristic"]),
- maxiter=Int(500, 2000, step=100),
- auto=Cat([True, False]),
- information_criterion=Cat(["aic", "bic", "aicc"]),
- )
-
-
-class NaiveForecaster(ForecastModel):
- """Naive Forecaster.
-
- NaiveForecaster is a dummy forecaster that makes forecasts using
- simple strategies based on naive assumptions about past trends
- continuing. When used in [multivariate][] tasks, each column is
- forecasted with the same strategy.
-
- Corresponding estimators are:
-
- - [NaiveForecaster][naiveforecasterclass] for forecasting tasks.
-
- See Also
- --------
- atom.models:ExponentialSmoothing
- atom.models:Dummy
- atom.models:PolynomialTrend
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_airline
-
- y = load_airline()
-
- atom = ATOMForecaster(y, random_state=1)
- atom.run(models="NF", verbose=2)
-
- ```
-
- """
-
- acronym = "NF"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = True
- has_validation = None
- supports_engines = ["sktime"]
-
- _module = "sktime.forecasting.naive"
- _estimators = CustomDict({"fc": "NaiveForecaster"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(strategy=Cat(["last", "mean", "drift"]))
-
-
-class PolynomialTrend(ForecastModel):
- """Polynomial Trend forecaster.
-
- Forecast time series data with a polynomial trend, using a sklearn
- [LinearRegression][] class to regress values of time series on
- index, after extraction of polynomial features.
-
- Corresponding estimators are:
-
- - [PolynomialTrendForecaster][] for forecasting tasks.
-
- See Also
- --------
- atom.models:ARIMA
- atom.models:ETS
- atom.models:NaiveForecaster
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_airline
-
- y = load_airline()
-
- atom = ATOMForecaster(y, random_state=1)
- atom.run(models="PT", verbose=2)
- ```
-
- """
-
- acronym = "PT"
- needs_scaling = False
- accepts_sparse = False
- native_multilabel = False
- native_multioutput = True
- has_validation = None
- supports_engines = ["sktime"]
-
- _module = "sktime.forecasting.trend"
- _estimators = CustomDict({"fc": "PolynomialTrendForecaster"})
-
- @staticmethod
- def _get_distributions() -> CustomDict:
- """Get the predefined hyperparameter distributions.
-
- Returns
- -------
- CustomDict
- Hyperparameter distributions.
-
- """
- return CustomDict(
- degree=Int(1, 5),
- with_intercept=Cat([True, False]),
- )
-
-
-# Ensembles ======================================================== >>
-
-class Stacking(ClassRegModel):
- """Stacking ensemble.
-
- Parameters
- ----------
- models: ClassMap
- Models from which to build the ensemble.
-
- **kwargs
- Additional keyword arguments for the estimator.
-
- """
-
- acronym = "Stack"
- needs_scaling = False
- has_validation = None
- native_multilabel = False
- native_multioutput = False
- supports_engines = []
-
- _module = "atom.ensembles"
- _estimators = CustomDict({"class": "StackingClassifier", "reg": "StackingRegressor"})
-
- def __init__(self, models: ClassMap, **kwargs):
- self._models = models
- kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)}
- super().__init__(**kw_model)
- self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model}
-
- def _get_est(self, **params) -> PREDICTOR:
- """Get the model's estimator with unpacked parameters.
-
- Returns
- -------
- Predictor
- Estimator instance.
-
- """
- estimators = []
- for m in self._models:
- if m.scaler:
- name = f"pipeline_{m.name}"
- est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)])
- else:
- name = m.name
- est = m.estimator
-
- estimators.append((name, est))
-
- return self._est_class(
- estimators=estimators,
- n_jobs=params.pop("n_jobs", self.n_jobs),
- **params,
- )
-
-
-class Voting(ClassRegModel):
- """Voting ensemble.
-
- Parameters
- ----------
- models: ClassMap
- Models from which to build the ensemble.
-
- **kwargs
- Additional keyword arguments for the estimator.
-
- """
-
- acronym = "Vote"
- needs_scaling = False
- has_validation = None
- native_multilabel = False
- native_multioutput = False
- supports_engines = []
-
- _module = "atom.ensembles"
- _estimators = CustomDict({"class": "VotingClassifier", "reg": "VotingRegressor"})
-
- def __init__(self, models: ClassMap, **kwargs):
- self._models = models
- kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)}
- super().__init__(**kw_model)
- self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model}
-
- if self._est_params.get("voting") == "soft":
- for m in self._models:
- if not hasattr(m.estimator, "predict_proba"):
- raise ValueError(
- "Invalid value for the voting parameter. If "
- "'soft', all models in the ensemble should have "
- f"a predict_proba method, got {m._fullname}."
- )
-
- def _get_est(self, **params) -> PREDICTOR:
- """Get the model's estimator with unpacked parameters.
-
- Returns
- -------
- Predictor
- Estimator instance.
-
- """
- estimators = []
- for m in self._models:
- if m.scaler:
- name = f"pipeline_{m.name}"
- est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)])
- else:
- name = m.name
- est = m.estimator
-
- estimators.append((name, est))
-
- return self._est_class(
- estimators=estimators,
- n_jobs=params.pop("n_jobs", self.n_jobs),
- **params,
- )
-
-
-# Variables ======================================================== >>
-
-# Available models
-MODELS = ClassMap(
- AdaBoost,
- ARIMA,
- AutoARIMA,
- AutomaticRelevanceDetermination,
- Bagging,
- BayesianRidge,
- BernoulliNB,
- CatBoost,
- CategoricalNB,
- ComplementNB,
- DecisionTree,
- Dummy,
- ElasticNet,
- ETS,
- ExponentialSmoothing,
- ExtraTree,
- ExtraTrees,
- GaussianNB,
- GaussianProcess,
- GradientBoostingMachine,
- HuberRegression,
- HistGradientBoosting,
- KNearestNeighbors,
- Lasso,
- LeastAngleRegression,
- LightGBM,
- LinearDiscriminantAnalysis,
- LinearSVM,
- LogisticRegression,
- MultiLayerPerceptron,
- MultinomialNB,
- NaiveForecaster,
- OrdinaryLeastSquares,
- OrthogonalMatchingPursuit,
- PassiveAggressive,
- Perceptron,
- PolynomialTrend,
- QuadraticDiscriminantAnalysis,
- RadiusNearestNeighbors,
- RandomForest,
- Ridge,
- StochasticGradientDescent,
- SupportVectorMachine,
- XGBoost,
- key="acronym",
-)
-
-# Available ensembles
-ENSEMBLES = ClassMap(Stacking, Voting, key="acronym")
-
-# Available models + ensembles
-MODELS_ENSEMBLES = ClassMap(*MODELS, *ENSEMBLES, key="acronym")
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing classification and regression models.
+
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from optuna.distributions import CategoricalDistribution as Cat
+from optuna.distributions import FloatDistribution as Float
+from optuna.distributions import IntDistribution as Int
+from optuna.exceptions import TrialPruned
+from optuna.integration import (
+ CatBoostPruningCallback, LightGBMPruningCallback, XGBoostPruningCallback,
+)
+from optuna.trial import Trial
+
+from atom.basemodel import ClassRegModel
+from atom.utils.types import DATAFRAME, PANDAS, PREDICTOR
+from atom.utils.utils import CatBMetric, CustomDict, LGBMetric, XGBMetric
+
+
class AdaBoost(ClassRegModel):
    """Adaptive Boosting (with decision tree as base estimator).

    AdaBoost is a meta-estimator that begins by fitting a
    classifier/regressor on the original dataset and then fits
    additional copies of the algorithm on the same dataset but where
    the weights of instances are adjusted according to the error of
    the current prediction.

    Corresponding estimators are:

    - [AdaBoostClassifier][] for classification tasks.
    - [AdaBoostRegressor][] for regression tasks.

    Read more in sklearn's [documentation][adabdocs].

    See Also
    --------
    atom.models:GradientBoostingMachine
    atom.models:RandomForest
    atom.models:XGBoost

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="AdaB", metric="f1", verbose=2)
    ```

    """

    acronym = "AdaB"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "ensemble"
    _estimators = CustomDict({"class": "AdaBoostClassifier", "reg": "AdaBoostRegressor"})

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        # One extra hyperparameter depends on the task: classification
        # tunes the boosting algorithm, regression tunes the loss
        if self.goal == "class":
            key, value = "algorithm", Cat(["SAMME.R", "SAMME"])
        else:
            key, value = "loss", Cat(["linear", "square", "exponential"])

        dist = CustomDict(
            n_estimators=Int(50, 500, step=10),
            learning_rate=Float(0.01, 10, log=True),
        )
        dist[key] = value

        return dist
+
+
class AutomaticRelevanceDetermination(ClassRegModel):
    """Automatic Relevance Determination.

    Automatic Relevance Determination is very similar to
    [BayesianRidge][], but can lead to sparser coefficients. Fit the
    weights of a regression model, using an ARD prior. The weights of
    the regression model are assumed to be in Gaussian distributions.

    Corresponding estimators are:

    - [ARDRegression][] for regression tasks.

    Read more in sklearn's [documentation][arddocs].

    See Also
    --------
    atom.models:BayesianRidge
    atom.models:GaussianProcess
    atom.models:LeastAngleRegression

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="ARD", metric="r2", verbose=2)
    ```

    """

    acronym = "ARD"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "ARDRegression"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        # The four gamma-prior hyperparameters share one log-uniform space
        priors = {
            name: Float(1e-4, 1, log=True)
            for name in ("alpha_1", "alpha_2", "lambda_1", "lambda_2")
        }

        return CustomDict(n_iter=Int(100, 1000, step=10), **priors)
+
+
class Bagging(ClassRegModel):
    """Bagging model (with decision tree as base estimator).

    Bagging uses an ensemble meta-estimator that fits base predictors
    on random subsets of the original dataset and then aggregate their
    individual predictions (either by voting or by averaging) to form a
    final prediction. Such a meta-estimator can typically be used as a
    way to reduce the variance of a black-box estimator by introducing
    randomization into its construction procedure and then making an
    ensemble out of it.

    Corresponding estimators are:

    - [BaggingClassifier][] for classification tasks.
    - [BaggingRegressor][] for regression tasks.

    Read more in sklearn's [documentation][bagdocs].

    See Also
    --------
    atom.models:DecisionTree
    atom.models:LogisticRegression
    atom.models:RandomForest

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="Bag", metric="f1", verbose=2)
    ```

    """

    acronym = "Bag"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "ensemble"
    _estimators = CustomDict({"class": "BaggingClassifier", "reg": "BaggingRegressor"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        space = {
            "n_estimators": Int(10, 500, step=10),
            "max_samples": Float(0.5, 1.0, step=0.1),
            "max_features": Float(0.5, 1.0, step=0.1),
            "bootstrap": Cat([True, False]),
            "bootstrap_features": Cat([True, False]),
        }

        return CustomDict(**space)
+
+
class BayesianRidge(ClassRegModel):
    """Bayesian ridge regression.

    Bayesian regression techniques can be used to include regularization
    parameters in the estimation procedure: the regularization parameter
    is not set in a hard sense but tuned to the data at hand.

    Corresponding estimators are:

    - [BayesianRidge][bayesianridgeclass] for regression tasks.

    Read more in sklearn's [documentation][brdocs].

    See Also
    --------
    atom.models:AutomaticRelevanceDetermination
    atom.models:GaussianProcess
    atom.models:LeastAngleRegression

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="BR", metric="r2", verbose=2)
    ```

    """

    acronym = "BR"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "BayesianRidge"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(n_iter=Int(100, 1000, step=10))

        # All four gamma priors are searched over the same log-uniform range
        for prior in ("alpha_1", "alpha_2", "lambda_1", "lambda_2"):
            dist[prior] = Float(1e-4, 1, log=True)

        return dist
+
+
class BernoulliNB(ClassRegModel):
    """Bernoulli Naive Bayes.

    BernoulliNB implements the Naive Bayes algorithm for multivariate
    Bernoulli models. Like [MultinomialNB][], this classifier is
    suitable for discrete data. The difference is that while MNB works
    with occurrence counts, BNB is designed for binary/boolean features.

    Corresponding estimators are:

    - [BernoulliNB][bernoullinbclass] for classification tasks.

    Read more in sklearn's [documentation][bnbdocs].

    See Also
    --------
    atom.models:ComplementNB
    atom.models:CategoricalNB
    atom.models:MultinomialNB

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="BNB", metric="f1", verbose=2)
    ```

    """

    acronym = "BNB"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "naive_bayes"
    _estimators = CustomDict({"class": "BernoulliNB"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        alpha = Float(0.01, 10, log=True)
        fit_prior = Cat([True, False])

        return CustomDict(alpha=alpha, fit_prior=fit_prior)
+
+
class CatBoost(ClassRegModel):
    """Cat Boosting Machine.

    CatBoost is a machine learning method based on gradient boosting
    over decision trees. Main advantages of CatBoost:

    - Superior quality when compared with other GBDT models on many
      datasets.
    - Best in class prediction speed.

    Corresponding estimators are:

    - [CatBoostClassifier][] for classification tasks.
    - [CatBoostRegressor][] for regression tasks.

    Read more in CatBoost's [documentation][catbdocs].

    !!! warning
        * CatBoost selects the weights achieved by the best evaluation
          on the test set after training. This means that, by default,
          there is some minor data leakage in the test set. Use the
          `use_best_model=False` parameter to avoid this behavior or use
          a [holdout set][data-sets] to evaluate the final estimator.
        * [In-training validation][] and [pruning][] are disabled when
          `#!python device="gpu"`.

    !!! note
        ATOM uses CatBoost's `n_estimators` parameter instead of
        `iterations` to indicate the number of trees to fit. This is
        done to have consistent naming with the [XGBoost][] and
        [LightGBM][] models.

    See Also
    --------
    atom.models:GradientBoostingMachine
    atom.models:LightGBM
    atom.models:XGBoost

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="CatB", metric="f1", verbose=2)
    ```

    """

    acronym = "CatB"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = "n_estimators"
    supports_engines = ["catboost"]

    _module = "catboost"
    _estimators = CustomDict({"class": "CatBoostClassifier", "reg": "CatBoostRegressor"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # bagging_temperature is only valid for the Bayesian bootstrap
        # and subsample only for the Bernoulli bootstrap, so drop the
        # one that doesn't apply to the sampled bootstrap_type
        if self._get_param("bootstrap_type", params) == "Bernoulli":
            params.pop("bagging_temperature")
        elif self._get_param("bootstrap_type", params) == "Bayesian":
            params.pop("subsample")

        return params

    def _get_est(self, **params) -> PREDICTOR:
        """Get the estimator instance.

        Parameters
        ----------
        **params
            Unpacked hyperparameters for the estimator.

        Returns
        -------
        Predictor
            Estimator instance.

        """
        # Custom eval metrics are only used on CPU (disabled on GPU)
        eval_metric = None
        if getattr(self, "_metric", None) and not self._gpu:
            eval_metric = CatBMetric(self._metric[0], task=self.task)

        return self._est_class(
            eval_metric=params.pop("eval_metric", eval_metric),
            train_dir=params.pop("train_dir", ""),
            allow_writing_files=params.pop("allow_writing_files", False),
            thread_count=params.pop("n_jobs", self.n_jobs),
            task_type=params.pop("task_type", "GPU" if self._gpu else "CPU"),
            devices=str(self._device_id),
            verbose=params.pop("verbose", False),
            random_state=params.pop("random_state", self.random_state),
            **params,
        )

    def _fit_estimator(
        self,
        estimator: PREDICTOR,
        data: tuple[DATAFRAME, PANDAS],
        est_params_fit: dict,
        validation: tuple[DATAFRAME, PANDAS] | None = None,
        trial: Trial | None = None,
    ):
        """Fit the estimator and perform in-training validation.

        Parameters
        ----------
        estimator: Predictor
            Instance to fit.

        data: tuple
            Training data of the form (X, y).

        est_params_fit: dict
            Additional parameters for the estimator's fit method.

        validation: tuple or None
            Validation data of the form (X, y). If None, no validation
            is performed.

        trial: [Trial][] or None
            Active trial (during hyperparameter tuning).

        Returns
        -------
        Predictor
            Fitted instance.

        """
        # Bind the metric name up front; it's needed both in the
        # validation branch and in the pruning branch below (previously
        # it was only bound under `if validation`, raising NameError on
        # a pruned trial without validation data)
        m = self._metric[0].name
        params = est_params_fit.copy()

        callbacks = params.pop("callbacks", [])
        if trial and len(self._metric) == 1 and not self._gpu:
            callbacks.append(cb := CatBoostPruningCallback(trial, "CatBMetric"))

        # gpu implementation fails if callbacks!=None
        estimator.fit(*data, eval_set=validation, callbacks=callbacks or None, **params)

        if not self._gpu:
            if validation:
                # Create evals attribute with train and validation scores
                evals = estimator.evals_result_
                self._evals[f"{m}_train"] = evals["learn"]["CatBMetric"]
                self._evals[f"{m}_test"] = evals["validation"]["CatBMetric"]

            if trial and len(self._metric) == 1 and cb._pruned:
                # Add the pruned step to the output (use the private
                # _evals consistently with the assignments above)
                step = len(self._evals[f"{m}_train"])
                steps = estimator.get_params()[self.has_validation]
                trial.params[self.has_validation] = f"{step}/{steps}"

                trial.set_user_attr("estimator", estimator)
                raise TrialPruned(cb._message)

        return estimator

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        return CustomDict(
            n_estimators=Int(20, 500, step=10),
            learning_rate=Float(0.01, 1.0, log=True),
            max_depth=Cat([None, *range(1, 17)]),
            min_child_samples=Int(1, 30),
            bootstrap_type=Cat(["Bayesian", "Bernoulli"]),
            bagging_temperature=Float(0, 10),
            subsample=Float(0.5, 1.0, step=0.1),
            reg_lambda=Float(0.001, 100, log=True),
        )
+
+
class CategoricalNB(ClassRegModel):
    """Categorical Naive Bayes.

    Implementation of the Naive Bayes algorithm for data whose
    features are categorically distributed.

    Corresponding estimators are:

    - [CategoricalNB][categoricalnbclass] for classification tasks.

    Read more in sklearn's [documentation][catnbdocs].

    See Also
    --------
    atom.models:BernoulliNB
    atom.models:ComplementNB
    atom.models:GaussianNB

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    import numpy as np

    X = np.random.randint(5, size=(100, 100))
    y = np.random.randint(2, size=100)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="CatNB", metric="f1", verbose=2)
    ```

    """

    acronym = "CatNB"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "naive_bayes"
    _estimators = CustomDict({"class": "CategoricalNB"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        dist["alpha"] = Float(0.01, 10, log=True)
        dist["fit_prior"] = Cat([True, False])
        return dist
+
+
class ComplementNB(ClassRegModel):
    """Complement Naive Bayes.

    The Complement Naive Bayes classifier corrects the "severe
    assumptions" of the standard [MultinomialNB][] classifier, which
    makes it especially well suited for imbalanced datasets.

    Corresponding estimators are:

    - [ComplementNB][complementnbclass] for classification tasks.

    Read more in sklearn's [documentation][cnbdocs].

    See Also
    --------
    atom.models:BernoulliNB
    atom.models:CategoricalNB
    atom.models:MultinomialNB

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="CNB", metric="f1", verbose=2)
    ```

    """

    acronym = "CNB"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "naive_bayes"
    _estimators = CustomDict({"class": "ComplementNB"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        dist["alpha"] = Float(0.01, 10, log=True)
        dist["fit_prior"] = Cat([True, False])
        dist["norm"] = Cat([True, False])
        return dist
+
+
class DecisionTree(ClassRegModel):
    """Single Decision Tree.

    A single decision tree classifier/regressor.

    Corresponding estimators are:

    - [DecisionTreeClassifier][] for classification tasks.
    - [DecisionTreeRegressor][] for regression tasks.

    Read more in sklearn's [documentation][treedocs].

    See Also
    --------
    atom.models:ExtraTree
    atom.models:ExtraTrees
    atom.models:RandomForest

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="Tree", metric="f1", verbose=2)
    ```

    """

    acronym = "Tree"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "tree"
    _estimators = CustomDict(
        {"class": "DecisionTreeClassifier", "reg": "DecisionTreeRegressor"}
    )

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        # The supported split criteria depend on the task
        criterion = (
            ["gini", "entropy"]
            if self.goal == "class"
            else ["squared_error", "absolute_error", "friedman_mse", "poisson"]
        )

        return CustomDict(
            criterion=Cat(criterion),
            splitter=Cat(["best", "random"]),
            max_depth=Cat([None, *range(1, 17)]),
            min_samples_split=Int(2, 20),
            min_samples_leaf=Int(1, 20),
            max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
            ccp_alpha=Float(0, 0.035, step=0.005),
        )
+
+
class Dummy(ClassRegModel):
    """Dummy classifier/regressor.

    A simple sanity check for supervised learning consists of
    comparing an estimator against basic rules of thumb. This model's
    prediction methods ignore the input data entirely. Don't use it
    for real problems; it only serves as a baseline to compare other
    models against.

    Corresponding estimators are:

    - [DummyClassifier][] for classification tasks.
    - [DummyRegressor][] for regression tasks.

    Read more in sklearn's [documentation][dummydocs].

    See Also
    --------
    atom.models:DecisionTree
    atom.models:ExtraTree
    atom.models:NaiveForecaster

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="Dummy", metric="f1", verbose=2)
    ```

    """

    acronym = "Dummy"
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "dummy"
    _estimators = CustomDict({"class": "DummyClassifier", "reg": "DummyRegressor"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # The quantile parameter is only used by the quantile strategy
        strategy = self._get_param("strategy", params)
        if strategy != "quantile":
            params.pop("quantile")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        if self.goal == "class":
            return CustomDict(
                strategy=Cat(["most_frequent", "prior", "stratified", "uniform"]),
            )

        return CustomDict(
            strategy=Cat(["mean", "median", "quantile"]),
            quantile=Float(0, 1.0, step=0.1),
        )
+
+
class ElasticNet(ClassRegModel):
    """Linear Regression with elasticnet regularization.

    Linear least squares combining both l1 and l2 regularization.

    Corresponding estimators are:

    - [ElasticNet][elasticnetreg] for regression tasks.

    Read more in sklearn's [documentation][endocs].

    See Also
    --------
    atom.models:Lasso
    atom.models:OrdinaryLeastSquares
    atom.models:Ridge

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="EN", metric="r2", verbose=2)
    ```

    """

    acronym = "EN"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "ElasticNet"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        dist["alpha"] = Float(1e-3, 10, log=True)
        dist["l1_ratio"] = Float(0.1, 0.9, step=0.1)
        dist["selection"] = Cat(["cyclic", "random"])
        return dist
+
+
class ExtraTree(ClassRegModel):
    """Extremely Randomized Tree.

    Extra-trees are built differently from classic decision trees.
    When searching for the best split to separate a node's samples
    into two groups, random splits are drawn for each of the
    max_features randomly selected features, and the best of those is
    chosen. With max_features set to 1, this amounts to building a
    totally random decision tree.

    Corresponding estimators are:

    - [ExtraTreeClassifier][] for classification tasks.
    - [ExtraTreeRegressor][] for regression tasks.

    Read more in sklearn's [documentation][treedocs].

    See Also
    --------
    atom.models:DecisionTree
    atom.models:ExtraTrees
    atom.models:RandomForest

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="ETree", metric="f1", verbose=2)
    ```

    """

    acronym = "ETree"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "tree"
    _estimators = CustomDict(
        {"class": "ExtraTreeClassifier", "reg": "ExtraTreeRegressor"}
    )

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # max_samples is meaningless without bootstrapping
        # NOTE(review): neither key appears in this class's predefined
        # distributions — presumably this guard covers user-provided
        # distributions; confirm.
        if not self._get_param("bootstrap", params):
            params.pop("max_samples")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        # The supported split criteria depend on the task
        criterion = (
            ["gini", "entropy"]
            if self.goal == "class"
            else ["squared_error", "absolute_error"]
        )

        return CustomDict(
            criterion=Cat(criterion),
            splitter=Cat(["random", "best"]),
            max_depth=Cat([None, *range(1, 17)]),
            min_samples_split=Int(2, 20),
            min_samples_leaf=Int(1, 20),
            max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
            ccp_alpha=Float(0, 0.035, step=0.005),
        )
+
+
class ExtraTrees(ClassRegModel):
    """Extremely Randomized Trees.

    Extra-Trees is a meta estimator that fits a number of randomized
    decision trees (a.k.a. [extra-trees][extratree]) on various
    sub-samples of the dataset, averaging them to improve predictive
    accuracy and to control over-fitting.

    Corresponding estimators are:

    - [ExtraTreesClassifier][] for classification tasks.
    - [ExtraTreesRegressor][] for regression tasks.

    Read more in sklearn's [documentation][etdocs].

    See Also
    --------
    atom.models:DecisionTree
    atom.models:ExtraTree
    atom.models:RandomForest

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="ET", metric="f1", verbose=2)
    ```

    """

    acronym = "ET"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "ensemble"
    _estimators = CustomDict(
        {"class": "ExtraTreesClassifier", "reg": "ExtraTreesRegressor"}
    )

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # max_samples only applies when sampling with replacement
        if not self._get_param("bootstrap", params):
            params.pop("max_samples")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        # The supported split criteria depend on the task
        criterion = (
            ["gini", "entropy"]
            if self.goal == "class"
            else ["squared_error", "absolute_error"]
        )

        return CustomDict(
            n_estimators=Int(10, 500, step=10),
            criterion=Cat(criterion),
            max_depth=Cat([None, *range(1, 17)]),
            min_samples_split=Int(2, 20),
            min_samples_leaf=Int(1, 20),
            max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
            bootstrap=Cat([True, False]),
            max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
            ccp_alpha=Float(0, 0.035, step=0.005),
        )
+
+
class GaussianNB(ClassRegModel):
    """Gaussian Naive Bayes.

    Implementation of the Naive Bayes classification algorithm where
    the likelihood of the features is assumed to be Gaussian.

    Corresponding estimators are:

    - [GaussianNB][gaussiannbclass] for classification tasks.

    Read more in sklearn's [documentation][gnbdocs].

    See Also
    --------
    atom.models:BernoulliNB
    atom.models:CategoricalNB
    atom.models:ComplementNB

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="GNB", metric="f1", verbose=2)
    ```

    """

    acronym = "GNB"
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "naive_bayes"
    _estimators = CustomDict({"class": "GaussianNB"})
+
+
class GaussianProcess(ClassRegModel):
    """Gaussian process.

    Gaussian Processes are a generic supervised learning method aimed
    at regression and probabilistic classification problems. Their
    advantages are:

    * The prediction interpolates the observations.
    * The prediction is probabilistic (Gaussian), so empirical
      confidence intervals can be computed and used to decide whether
      the prediction should be refit (online fitting, adaptive
      fitting) in some region of interest.

    Their disadvantages include:

    * They are not sparse, i.e. they use the whole samples/features
      information to perform the prediction.
    * They lose efficiency in high dimensional spaces, namely when the
      number of features exceeds a few dozens.

    Corresponding estimators are:

    - [GaussianProcessClassifier][] for classification tasks.
    - [GaussianProcessRegressor][] for regression tasks.

    Read more in sklearn's [documentation][gpdocs].

    See Also
    --------
    atom.models:GaussianNB
    atom.models:LinearDiscriminantAnalysis
    atom.models:PassiveAggressive

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="GP", metric="f1", verbose=2)
    ```

    """

    acronym = "GP"
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "gaussian_process"
    _estimators = CustomDict(
        {"class": "GaussianProcessClassifier", "reg": "GaussianProcessRegressor"}
    )
+
+
class GradientBoostingMachine(ClassRegModel):
    """Gradient Boosting Machine.

    A Gradient Boosting Machine builds an additive model in a forward
    stage-wise fashion, allowing the optimization of arbitrary
    differentiable loss functions. At each stage, `n_classes_`
    regression trees are fit on the negative gradient of the loss
    function, e.g. binary or multiclass log loss. Binary
    classification is a special case where only a single regression
    tree is induced.

    Corresponding estimators are:

    - [GradientBoostingClassifier][] for classification tasks.
    - [GradientBoostingRegressor][] for regression tasks.

    Read more in sklearn's [documentation][gbmdocs].

    !!! tip
        [HistGradientBoosting][] is a much faster variant of this
        algorithm for intermediate datasets (n_samples >= 10k).

    See Also
    --------
    atom.models:CatBoost
    atom.models:HistGradientBoosting
    atom.models:LightGBM

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="GBM", metric="f1", verbose=2)
    ```

    """

    acronym = "GBM"
    needs_scaling = False
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "ensemble"
    _estimators = CustomDict(
        {"class": "GradientBoostingClassifier", "reg": "GradientBoostingRegressor"}
    )

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # alpha is only used by the huber and quantile losses
        loss = self._get_param("loss", params)
        if loss not in ("huber", "quantile"):
            params.pop("alpha")

        return params

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            loss=Cat(["log_loss", "exponential"]),
            learning_rate=Float(0.01, 1.0, log=True),
            n_estimators=Int(10, 500, step=10),
            subsample=Float(0.5, 1.0, step=0.1),
            criterion=Cat(["friedman_mse", "squared_error"]),
            min_samples_split=Int(2, 20),
            min_samples_leaf=Int(1, 20),
            max_depth=Int(1, 21),
            max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
            ccp_alpha=Float(0, 0.035, step=0.005),
        )

        if self.task.startswith("multiclass"):
            # Multiclass only supports log_loss
            dist.pop("loss")
        elif self.goal.startswith("reg"):
            # Regression uses its own losses and adds the alpha parameter
            dist["loss"] = Cat(["squared_error", "absolute_error", "huber", "quantile"])
            dist["alpha"] = Float(0.1, 0.9, step=0.1)

        return dist
+
+
class HuberRegression(ClassRegModel):
    """Huber regressor.

    Huber is a linear regression model robust to outliers. It ensures
    the loss function isn't heavily influenced by the outliers while
    not completely ignoring their effect.

    Corresponding estimators are:

    - [HuberRegressor][] for regression tasks.

    Read more in sklearn's [documentation][huberdocs].

    See Also
    --------
    atom.models:AutomaticRelevanceDetermination
    atom.models:LeastAngleRegression
    atom.models:OrdinaryLeastSquares

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="Huber", metric="r2", verbose=2)
    ```

    """

    acronym = "Huber"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "HuberRegressor"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        dist["epsilon"] = Float(1, 10, log=True)
        dist["max_iter"] = Int(50, 500, step=10)
        dist["alpha"] = Float(1e-4, 1, log=True)
        return dist
+
+
class HistGradientBoosting(ClassRegModel):
    """Histogram-based Gradient Boosting Machine.

    The Histogram-based Gradient Boosting Machine is much faster than
    the standard [GradientBoostingMachine][] on big datasets
    (n_samples>=10k). It first bins the input samples into
    integer-valued bins, which tremendously reduces the number of
    splitting points to consider and lets the algorithm leverage
    integer-based data structures (histograms) instead of sorted
    continuous values when building the trees.

    Corresponding estimators are:

    - [HistGradientBoostingClassifier][] for classification tasks.
    - [HistGradientBoostingRegressor][] for regression tasks.

    Read more in sklearn's [documentation][hgbmdocs].

    See Also
    --------
    atom.models:CatBoost
    atom.models:GradientBoostingMachine
    atom.models:XGBoost

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="hGBM", metric="f1", verbose=2)
    ```

    """

    acronym = "hGBM"
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "ensemble"
    _estimators = CustomDict(
        {
            "class": "HistGradientBoostingClassifier",
            "reg": "HistGradientBoostingRegressor",
        }
    )

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            loss=Cat(["squared_error", "absolute_error", "poisson", "quantile", "gamma"]),
            learning_rate=Float(0.01, 1.0, log=True),
            max_iter=Int(10, 500, step=10),
            max_leaf_nodes=Int(10, 50),
            max_depth=Cat([None, *range(1, 17)]),
            min_samples_leaf=Int(10, 30),
            l2_regularization=Float(0, 1.0, step=0.1),
        )

        # The loss parameter only applies to the regressor
        if self.goal == "class":
            dist.pop("loss")

        return dist
+
+
class KNearestNeighbors(ClassRegModel):
    """K-Nearest Neighbors.

    K-Nearest Neighbors, as the name clearly indicates, implements the
    k-nearest neighbors vote. For regression, the target is predicted
    by local interpolation of the targets associated of the nearest
    neighbors in the training set.

    Corresponding estimators are:

    - [KNeighborsClassifier][] for classification tasks.
    - [KNeighborsRegressor][] for regression tasks.

    Read more in sklearn's [documentation][knndocs].

    See Also
    --------
    atom.models:LinearDiscriminantAnalysis
    atom.models:QuadraticDiscriminantAnalysis
    atom.models:RadiusNearestNeighbors

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="KNN", metric="f1", verbose=2)
    ```

    """

    acronym = "KNN"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "neighbors"
    _estimators = CustomDict(
        {"class": "KNeighborsClassifier", "reg": "KNeighborsRegressor"}
    )

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        The space is trimmed for accelerated engines, which support
        only a subset of the hyperparameters.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            n_neighbors=Int(1, 100),
            weights=Cat(["uniform", "distance"]),
            algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
            leaf_size=Int(20, 40),
            p=Int(1, 2),
        )

        if self._gpu:
            dist.pop("algorithm")  # Only 'brute' is supported
            if self.engine.get("estimator") == "cuml":
                dist.pop("weights")  # Only 'uniform' is supported
                dist.pop("leaf_size")
                dist.pop("p")

        return dist
+
+
class Lasso(ClassRegModel):
    """Linear Regression with lasso regularization.

    Linear least squares with l1 regularization.

    Corresponding estimators are:

    - [Lasso][lassoreg] for regression tasks.

    Read more in sklearn's [documentation][lassodocs].

    See Also
    --------
    atom.models:ElasticNet
    atom.models:OrdinaryLeastSquares
    atom.models:Ridge

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="Lasso", metric="r2", verbose=2)
    ```

    """

    acronym = "Lasso"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "Lasso"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        dist["alpha"] = Float(1e-3, 10, log=True)
        dist["selection"] = Cat(["cyclic", "random"])
        return dist
+
+
class LeastAngleRegression(ClassRegModel):
    """Least Angle Regression.

    Least-Angle Regression is a regression algorithm for
    high-dimensional data. Lars resembles forward stepwise regression:
    at each step, it finds the feature most correlated with the
    target. When multiple features have equal correlation, instead of
    continuing along the same feature, it proceeds in a direction
    equiangular between the features.

    Corresponding estimators are:

    - [Lars][] for regression tasks.

    Read more in sklearn's [documentation][larsdocs].

    See Also
    --------
    atom.models:BayesianRidge
    atom.models:HuberRegression
    atom.models:OrdinaryLeastSquares

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="Lars", metric="r2", verbose=2)
    ```

    """

    acronym = "Lars"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "Lars"})
+
+
class LightGBM(ClassRegModel):
    """Light Gradient Boosting Machine.

    LightGBM is a gradient boosting model that uses tree based learning
    algorithms. It is designed to be distributed and efficient with the
    following advantages:

    - Faster training speed and higher efficiency.
    - Lower memory usage.
    - Better accuracy.
    - Capable of handling large-scale data.

    Corresponding estimators are:

    - [LGBMClassifier][] for classification tasks.
    - [LGBMRegressor][] for regression tasks.

    Read more in LightGBM's [documentation][lgbdocs].

    !!! info
        Using LightGBM's [GPU acceleration][estimator-acceleration]
        requires [additional software dependencies][lgb_gpu].

    See Also
    --------
    atom.models:CatBoost
    atom.models:GradientBoostingMachine
    atom.models:XGBoost

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="LGB", metric="f1", verbose=2)
    ```

    """

    acronym = "LGB"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = "n_estimators"
    supports_engines = ["lightgbm"]

    _module = "lightgbm.sklearn"
    _estimators = CustomDict({"class": "LGBMClassifier", "reg": "LGBMRegressor"})

    def _get_est(self, **params) -> PREDICTOR:
        """Get the model's estimator with unpacked parameters.

        Parameters
        ----------
        **params
            Unpacked hyperparameters for the estimator. Explicitly
            passed keys take precedence over the defaults below.

        Returns
        -------
        Predictor
            Estimator instance.

        """
        # Custom lightgbm mapping for warnings
        # PYTHONWARNINGS doesn't work since they go from C/C++ code to stdout
        warns = dict(always=2, default=1, error=0, ignore=-1)

        return self._est_class(
            verbose=params.pop("verbose", warns.get(self.warnings, -1)),
            n_jobs=params.pop("n_jobs", self.n_jobs),
            device=params.pop("device", "gpu" if self._gpu else "cpu"),
            gpu_device_id=params.pop("gpu_device_id", self._device_id or -1),
            random_state=params.pop("random_state", self.random_state),
            **params,
        )

    def _fit_estimator(
        self,
        estimator: PREDICTOR,
        data: tuple[DATAFRAME, PANDAS],
        est_params_fit: dict,
        validation: tuple[DATAFRAME, PANDAS] | None = None,
        trial: Trial | None = None,
    ):
        """Fit the estimator and perform in-training validation.

        Parameters
        ----------
        estimator: Predictor
            Instance to fit.

        data: tuple
            Training data of the form (X, y).

        est_params_fit: dict
            Additional parameters for the estimator's fit method.

        validation: tuple or None
            Validation data of the form (X, y). If None, no validation
            is performed.

        trial: [Trial][] or None
            Active trial (during hyperparameter tuning).

        Returns
        -------
        Predictor
            Fitted instance.

        """
        from lightgbm.callback import log_evaluation

        # NOTE(review): unlike the eval_metric guard below, this assumes
        # self._metric is always set — confirm.
        m = self._metric[0].name
        # Copy so the caller's fit parameters are never mutated
        params = est_params_fit.copy()

        # log_evaluation(-1) silences lightgbm's per-iteration output
        callbacks = params.pop("callbacks", []) + [log_evaluation(-1)]
        # Pruning is only wired up for single-metric trials; it watches
        # the second eval set ("valid_1", the validation data)
        if trial and len(self._metric) == 1:
            callbacks.append(LightGBMPruningCallback(trial, m, "valid_1"))

        eval_metric = None
        if getattr(self, "_metric", None):
            eval_metric = LGBMetric(self._metric[0], task=self.task)

        try:
            estimator.fit(
                *data,
                eval_set=[data, validation] if validation else None,
                eval_metric=params.pop("eval_metric", eval_metric),
                callbacks=callbacks,
                **params,
            )
        except TrialPruned as ex:
            # Add the pruned step to the output
            # The pruned iteration is parsed from the exception message
            step = str(ex).split(" ")[-1][:-1]
            steps = estimator.get_params()[self.has_validation]
            trial.params[self.has_validation] = f"{step}/{steps}"

            # Keep the partially-fitted estimator available on the trial
            trial.set_user_attr("estimator", estimator)
            raise ex

        if validation:
            # Create evals attribute with train and validation scores
            self._evals[f"{m}_train"] = estimator.evals_result_["training"][m]
            self._evals[f"{m}_test"] = estimator.evals_result_["valid_1"][m]

        return estimator

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        return CustomDict(
            n_estimators=Int(20, 500, step=10),
            learning_rate=Float(0.01, 1.0, log=True),
            max_depth=Int(-1, 17, step=2),
            num_leaves=Int(20, 40),
            min_child_weight=Float(1e-4, 100, log=True),
            min_child_samples=Int(1, 30),
            subsample=Float(0.5, 1.0, step=0.1),
            colsample_bytree=Float(0.4, 1.0, step=0.1),
            reg_alpha=Float(1e-4, 100, log=True),
            reg_lambda=Float(1e-4, 100, log=True),
        )
+
+
class LinearDiscriminantAnalysis(ClassRegModel):
    """Linear Discriminant Analysis.

    Linear Discriminant Analysis is a classifier with a linear
    decision boundary, obtained by fitting class conditional densities
    to the data and applying Bayes' rule. The model fits a Gaussian
    density to each class, under the assumption that all classes share
    the same covariance matrix.

    Corresponding estimators are:

    - [LinearDiscriminantAnalysis][ldaclassifier] for classification tasks.

    Read more in sklearn's [documentation][ldadocs].

    See Also
    --------
    atom.models:LogisticRegression
    atom.models:RadiusNearestNeighbors
    atom.models:QuadraticDiscriminantAnalysis

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="LDA", metric="f1", verbose=2)
    ```

    """

    acronym = "LDA"
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "discriminant_analysis"
    _estimators = CustomDict({"class": "LinearDiscriminantAnalysis"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # Shrinkage is not supported by the svd solver
        solver = self._get_param("solver", params)
        if solver == "svd":
            params.pop("shrinkage")

        return params

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        dist["solver"] = Cat(["svd", "lsqr", "eigen"])
        dist["shrinkage"] = Cat([None, "auto", 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
        return dist
+
+
+class LinearSVM(ClassRegModel):
+ """Linear Support Vector Machine.
+
+ Similar to [SupportVectorMachine][] but with a linear kernel.
+ Implemented in terms of liblinear rather than libsvm, so it has
+ more flexibility in the choice of penalties and loss functions and
+ should scale better to large numbers of samples.
+
+ Corresponding estimators are:
+
+ - [LinearSVC][] for classification tasks.
+    - [LinearSVR][] for regression tasks.
+
+ Read more in sklearn's [documentation][svmdocs].
+
+ See Also
+ --------
+ atom.models:KNearestNeighbors
+ atom.models:StochasticGradientDescent
+ atom.models:SupportVectorMachine
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="lSVM", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "lSVM"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn", "cuml"]
+
+ _module = "svm"
+ _estimators = CustomDict({"class": "LinearSVC", "reg": "LinearSVR"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ if self.goal == "class":
+ if self._get_param("loss", params) == "hinge":
+ # l1 regularization can't be combined with hinge
+ params.replace_value("penalty", "l2")
+ # l2 regularization can't be combined with hinge when dual=False
+ params.replace_value("dual", True)
+ elif self._get_param("loss", params) == "squared_hinge":
+ # l1 regularization can't be combined with squared_hinge when dual=True
+ if self._get_param("penalty", params) == "l1":
+ params.replace_value("dual", False)
+ elif self._get_param("loss", params) == "epsilon_insensitive":
+ params.replace_value("dual", True)
+
+ return params
+
+ def _get_est(self, **params) -> PREDICTOR:
+ """Get the estimator instance.
+
+ Parameters
+ ----------
+ **params
+ Unpacked hyperparameters for the estimator.
+
+ Returns
+ -------
+ Predictor
+ Estimator instance.
+
+ """
+ if self.engine.get("estimator") == "cuml" and self.goal == "class":
+ return self._est_class(probability=params.pop("probability", True), **params)
+ else:
+ return super()._get_est(**params)
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ dist = CustomDict()
+ if self.goal == "class":
+ dist["penalty"] = Cat(["l1", "l2"])
+ dist["loss"] = Cat(["hinge", "squared_hinge"])
+ else:
+ dist["loss"] = Cat(["epsilon_insensitive", "squared_epsilon_insensitive"])
+
+ dist["C"] = Float(1e-3, 100, log=True)
+ dist["dual"] = Cat([True, False])
+
+ if self.engine.get("estimator") == "cuml":
+ dist.pop("dual")
+
+ return dist
+
+
+class LogisticRegression(ClassRegModel):
+ """Logistic Regression.
+
+ Logistic regression, despite its name, is a linear model for
+ classification rather than regression. Logistic regression is also
+ known in the literature as logit regression, maximum-entropy
+ classification (MaxEnt) or the log-linear classifier. In this model,
+ the probabilities describing the possible outcomes of a single trial
+ are modeled using a logistic function.
+
+ Corresponding estimators are:
+
+ - [LogisticRegression][] for classification tasks.
+
+ Read more in sklearn's [documentation][lrdocs].
+
+ See Also
+ --------
+ atom.models:GaussianProcess
+ atom.models:LinearDiscriminantAnalysis
+ atom.models:PassiveAggressive
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+    atom.run(models="LR", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "LR"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn", "sklearnex", "cuml"]
+
+ _module = "linear_model"
+ _estimators = CustomDict({"class": "LogisticRegression"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ # Limitations on penalty + solver combinations
+ penalty = self._get_param("penalty", params)
+ solver = self._get_param("solver", params)
+ cond_1 = penalty is None and solver == "liblinear"
+ cond_2 = penalty == "l1" and solver not in ("liblinear", "saga")
+ cond_3 = penalty == "elasticnet" and solver != "saga"
+
+ if cond_1 or cond_2 or cond_3:
+ params.replace_value("penalty", "l2") # Change to default value
+
+ if self._get_param("penalty", params) != "elasticnet":
+ params.pop("l1_ratio")
+
+ if self._get_param("penalty", params) is None:
+ params.pop("C")
+
+ return params
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ dist = CustomDict(
+ penalty=Cat([None, "l1", "l2", "elasticnet"]),
+ C=Float(1e-3, 100, log=True),
+ solver=Cat(["lbfgs", "newton-cg", "liblinear", "sag", "saga"]),
+ max_iter=Int(100, 1000, step=10),
+ l1_ratio=Float(0, 1.0, step=0.1),
+ )
+
+ if self._gpu:
+ dist.pop("solver")
+ dist.pop("penalty") # Only 'l2' is supported
+ elif self.engine.get("estimator") == "sklearnex":
+ dist["solver"] = Cat(["lbfgs", "newton-cg"])
+
+ return dist
+
+
+class MultiLayerPerceptron(ClassRegModel):
+ """Multi-layer Perceptron.
+
+ Multi-layer Perceptron is a supervised learning algorithm that
+ learns a function by training on a dataset. Given a set of features
+ and a target, it can learn a non-linear function approximator for
+ either classification or regression. It is different from logistic
+ regression, in that between the input and the output layer, there
+ can be one or more non-linear layers, called hidden layers.
+
+ Corresponding estimators are:
+
+ - [MLPClassifier][] for classification tasks.
+ - [MLPRegressor][] for regression tasks.
+
+ Read more in sklearn's [documentation][mlpdocs].
+
+ See Also
+ --------
+ atom.models:PassiveAggressive
+ atom.models:Perceptron
+ atom.models:StochasticGradientDescent
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="MLP", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "MLP"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = True
+ native_multioutput = False
+ has_validation = "max_iter"
+ supports_engines = ["sklearn"]
+
+ _module = "neural_network"
+ _estimators = CustomDict({"class": "MLPClassifier", "reg": "MLPRegressor"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ # Drop layers when a previous layer has 0 neurons
+ drop = False
+ for param in [p for p in sorted(params) if p.startswith("hidden_layer")]:
+ if params[param] == 0 or drop:
+ drop = True
+ params.pop(param)
+
+ if self._get_param("solver", params) != "sgd":
+ params.pop("learning_rate")
+ params.pop("power_t")
+ else:
+ params.pop("learning_rate_init")
+
+ return params
+
+ def _trial_to_est(self, params: CustomDict) -> CustomDict:
+ """Convert trial's hyperparameters to parameters for the estimator.
+
+ Parameters
+ ----------
+ params: CustomDict
+ Trial's hyperparameters.
+
+ Returns
+ -------
+ CustomDict
+ Estimator's hyperparameters.
+
+ """
+ params = super()._trial_to_est(params)
+
+ hidden_layer_sizes = []
+ for param in [p for p in sorted(params) if p.startswith("hidden_layer")]:
+ hidden_layer_sizes.append(params.pop(param))
+
+ if hidden_layer_sizes:
+ params.insert(0, "hidden_layer_sizes", tuple(hidden_layer_sizes))
+
+ return params
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ dist = CustomDict(
+ hidden_layer_1=Int(10, 100),
+ hidden_layer_2=Int(0, 100),
+ hidden_layer_3=Int(0, 10),
+ activation=Cat(["identity", "logistic", "tanh", "relu"]),
+ solver=Cat(["lbfgs", "sgd", "adam"]),
+ alpha=Float(1e-4, 0.1, log=True),
+ batch_size=Cat(["auto", 8, 16, 32, 64, 128, 256]),
+ learning_rate=Cat(["constant", "invscaling", "adaptive"]),
+ learning_rate_init=Float(1e-3, 0.1, log=True),
+ power_t=Float(0.1, 0.9, step=0.1),
+ max_iter=Int(50, 500, step=10),
+ )
+
+ # Drop layers if sizes are specified by user
+ return dist[3:] if "hidden_layer_sizes" in self._est_params else dist
+
+
+class MultinomialNB(ClassRegModel):
+ """Multinomial Naive Bayes.
+
+ MultinomialNB implements the Naive Bayes algorithm for multinomially
+ distributed data, and is one of the two classic Naive Bayes variants
+ used in text classification (where the data are typically
+ represented as word vector counts, although tf-idf vectors are also
+ known to work well in practice).
+
+ Corresponding estimators are:
+
+ - [MultinomialNB][multinomialnbclass] for classification tasks.
+
+ Read more in sklearn's [documentation][mnbdocs].
+
+ See Also
+ --------
+ atom.models:BernoulliNB
+ atom.models:ComplementNB
+ atom.models:GaussianNB
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="MNB", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "MNB"
+ needs_scaling = False
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn", "cuml"]
+
+ _module = "naive_bayes"
+ _estimators = CustomDict({"class": "MultinomialNB"})
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(
+ alpha=Float(0.01, 10, log=True),
+ fit_prior=Cat([True, False]),
+ )
+
+
+class OrdinaryLeastSquares(ClassRegModel):
+ """Linear Regression.
+
+ Ordinary Least Squares is just linear regression without any
+ regularization. It fits a linear model with coefficients `w=(w1,
+ ..., wp)` to minimize the residual sum of squares between the
+ observed targets in the dataset, and the targets predicted by the
+ linear approximation.
+
+ Corresponding estimators are:
+
+ - [LinearRegression][] for regression tasks.
+
+ Read more in sklearn's [documentation][olsdocs].
+
+ See Also
+ --------
+ atom.models:ElasticNet
+ atom.models:Lasso
+ atom.models:Ridge
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMRegressor
+ from sklearn.datasets import fetch_california_housing
+
+ X, y = fetch_california_housing(return_X_y=True)
+
+ atom = ATOMRegressor(X, y, random_state=1)
+ atom.run(models="OLS", metric="r2", verbose=2)
+ ```
+
+ """
+
+ acronym = "OLS"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn", "sklearnex", "cuml"]
+
+ _module = "linear_model"
+ _estimators = CustomDict({"reg": "LinearRegression"})
+
+
+class OrthogonalMatchingPursuit(ClassRegModel):
+ """Orthogonal Matching Pursuit.
+
+ Orthogonal Matching Pursuit implements the OMP algorithm for
+ approximating the fit of a linear model with constraints imposed
+ on the number of non-zero coefficients.
+
+ Corresponding estimators are:
+
+ - [OrthogonalMatchingPursuit][] for regression tasks.
+
+ Read more in sklearn's [documentation][ompdocs].
+
+ See Also
+ --------
+ atom.models:Lasso
+ atom.models:LeastAngleRegression
+ atom.models:OrdinaryLeastSquares
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMRegressor
+ from sklearn.datasets import fetch_california_housing
+
+ X, y = fetch_california_housing(return_X_y=True)
+
+ atom = ATOMRegressor(X, y, random_state=1)
+ atom.run(models="OMP", metric="r2", verbose=2)
+ ```
+
+ """
+
+ acronym = "OMP"
+ needs_scaling = True
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn"]
+
+ _module = "linear_model"
+ _estimators = CustomDict({"reg": "OrthogonalMatchingPursuit"})
+
+
+class PassiveAggressive(ClassRegModel):
+ """Passive Aggressive.
+
+ The passive-aggressive algorithms are a family of algorithms for
+ large-scale learning. They are similar to the Perceptron in that
+ they do not require a learning rate. However, contrary to the
+ [Perceptron][], they include a regularization parameter `C`.
+
+ Corresponding estimators are:
+
+ - [PassiveAggressiveClassifier][] for classification tasks.
+    - [PassiveAggressiveRegressor][] for regression tasks.
+
+ Read more in sklearn's [documentation][padocs].
+
+ See Also
+ --------
+ atom.models:MultiLayerPerceptron
+ atom.models:Perceptron
+ atom.models:StochasticGradientDescent
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="PA", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "PA"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = "max_iter"
+ supports_engines = ["sklearn"]
+
+ _module = "linear_model"
+ _estimators = CustomDict(
+ {"class": "PassiveAggressiveClassifier", "reg": "PassiveAggressiveRegressor"}
+ )
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ if self.goal == "class":
+ loss = ["hinge", "squared_hinge"]
+ else:
+ loss = ["epsilon_insensitive", "squared_epsilon_insensitive"]
+
+ return CustomDict(
+ C=Float(1e-3, 100, log=True),
+ max_iter=Int(500, 1500, step=50),
+ loss=Cat(loss),
+ average=Cat([True, False]),
+ )
+
+
+class Perceptron(ClassRegModel):
+ """Linear Perceptron classification.
+
+ The Perceptron is a simple classification algorithm suitable for
+ large scale learning. By default:
+
+ * It does not require a learning rate.
+ * It is not regularized (penalized).
+ * It updates its model only on mistakes.
+
+ The last characteristic implies that the Perceptron is slightly
+ faster to train than [StochasticGradientDescent][] with the hinge
+ loss and that the resulting models are sparser.
+
+ Corresponding estimators are:
+
+ - [Perceptron][percclassifier] for classification tasks.
+
+ Read more in sklearn's [documentation][percdocs].
+
+ See Also
+ --------
+ atom.models:MultiLayerPerceptron
+ atom.models:PassiveAggressive
+ atom.models:StochasticGradientDescent
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="Perc", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "Perc"
+ needs_scaling = True
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = "max_iter"
+ supports_engines = ["sklearn"]
+
+ _module = "linear_model"
+ _estimators = CustomDict({"class": "Perceptron"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ if self._get_param("penalty", params) != "elasticnet":
+ params.pop("l1_ratio")
+
+ return params
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(
+ penalty=Cat([None, "l2", "l1", "elasticnet"]),
+ alpha=Float(1e-4, 10, log=True),
+ l1_ratio=Float(0.1, 0.9, step=0.1),
+ max_iter=Int(500, 1500, step=50),
+ eta0=Float(1e-2, 10, log=True),
+ )
+
+
+class QuadraticDiscriminantAnalysis(ClassRegModel):
+ """Quadratic Discriminant Analysis.
+
+ Quadratic Discriminant Analysis is a classifier with a quadratic
+ decision boundary, generated by fitting class conditional densities
+    to the data and using Bayes’ rule. The model fits a Gaussian
+    density to each class, without assuming that all classes share
+    the same covariance matrix.
+
+ Corresponding estimators are:
+
+ - [QuadraticDiscriminantAnalysis][qdaclassifier] for classification tasks.
+
+    Read more in sklearn's [documentation][qdadocs].
+
+ See Also
+ --------
+ atom.models:LinearDiscriminantAnalysis
+ atom.models:LogisticRegression
+ atom.models:RadiusNearestNeighbors
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="QDA", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "QDA"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn"]
+
+ _module = "discriminant_analysis"
+ _estimators = CustomDict({"class": "QuadraticDiscriminantAnalysis"})
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(reg_param=Float(0, 1.0, step=0.1))
+
+
+class RadiusNearestNeighbors(ClassRegModel):
+ """Radius Nearest Neighbors.
+
+ Radius Nearest Neighbors implements the nearest neighbors vote,
+ where the neighbors are selected from within a given radius. For
+ regression, the target is predicted by local interpolation of the
+ targets associated of the nearest neighbors in the training set.
+
+ !!! warning
+ * The `radius` parameter should be tuned to the data at hand or
+ the model will perform poorly.
+ * If outliers are detected, the estimator raises an exception
+ unless `est_params={"outlier_label": "most_frequent"}` is used.
+
+ Corresponding estimators are:
+
+ - [RadiusNeighborsClassifier][] for classification tasks.
+ - [RadiusNeighborsRegressor][] for regression tasks.
+
+ Read more in sklearn's [documentation][knndocs].
+
+ See Also
+ --------
+ atom.models:KNearestNeighbors
+ atom.models:LinearDiscriminantAnalysis
+ atom.models:QuadraticDiscriminantAnalysis
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(
+ models="RNN",
+ metric="f1",
+ est_params={"outlier_label": "most_frequent"},
+ verbose=2,
+ )
+ ```
+
+ """
+
+ acronym = "RNN"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = True
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sklearn"]
+
+ _module = "neighbors"
+ _estimators = CustomDict(
+ {"class": "RadiusNeighborsClassifier", "reg": "RadiusNeighborsRegressor"}
+ )
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(
+ radius=Float(1e-2, 100),
+ weights=Cat(["uniform", "distance"]),
+ algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
+ leaf_size=Int(20, 40),
+ p=Int(1, 2),
+ )
+
+
+class RandomForest(ClassRegModel):
+ """Random Forest.
+
+ Random forests are an ensemble learning method that operate by
+ constructing a multitude of decision trees at training time and
+ outputting the class that is the mode of the classes
+ (classification) or mean prediction (regression) of the individual
+ trees. Random forests correct for decision trees' habit of
+ overfitting to their training set.
+
+ Corresponding estimators are:
+
+ - [RandomForestClassifier][] for classification tasks.
+ - [RandomForestRegressor][] for regression tasks.
+
+    Read more in sklearn's [documentation][rfdocs].
+
+ !!! warning
+ cuML's implementation of [RandomForestClassifier][cumlrf] only
+ supports predictions on dtype `float32`. Convert all dtypes
+ before calling atom's [run][atomclassifier-run] method to avoid
+ exceptions.
+
+ See Also
+ --------
+ atom.models:DecisionTree
+ atom.models:ExtraTrees
+ atom.models:HistGradientBoosting
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="RF", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "RF"
+ needs_scaling = False
+ accepts_sparse = True
+ native_multilabel = True
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sklearn", "sklearnex", "cuml"]
+
+ _module = "ensemble"
+ _estimators = CustomDict(
+ {"class": "RandomForestClassifier", "reg": "RandomForestRegressor"}
+ )
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ if not self._get_param("bootstrap", params):
+ params.pop("max_samples")
+
+ return params
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ if self.goal == "class":
+ criterion = ["gini", "entropy"]
+ else:
+ if self.engine.get("estimator") == "cuml":
+ criterion = ["mse", "poisson", "gamma", "inverse_gaussian"]
+ else:
+ criterion = ["squared_error", "absolute_error", "poisson"]
+
+ dist = CustomDict(
+ n_estimators=Int(10, 500, step=10),
+ criterion=Cat(criterion),
+ max_depth=Cat([None, *range(1, 17)]),
+ min_samples_split=Int(2, 20),
+ min_samples_leaf=Int(1, 20),
+ max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]),
+ bootstrap=Cat([True, False]),
+ max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]),
+ ccp_alpha=Float(0, 0.035, step=0.005),
+ )
+
+ if self.engine.get("estimator") == "sklearnex":
+ dist.pop("criterion")
+ dist.pop("ccp_alpha")
+ elif self.engine.get("estimator") == "cuml":
+ dist.replace_key("criterion", "split_criterion")
+ dist["max_depth"] = Int(1, 17)
+ dist["max_features"] = Cat(["sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9])
+ dist["max_samples"] = Float(0.5, 0.9, step=0.1)
+ dist.pop("ccp_alpha")
+
+ return dist
+
+
+class Ridge(ClassRegModel):
+ """Linear least squares with l2 regularization.
+
+ If classifier, it first converts the target values into {-1, 1}
+ and then treats the problem as a regression task.
+
+ Corresponding estimators are:
+
+ - [RidgeClassifier][] for classification tasks.
+ - [Ridge][ridgeregressor] for regression tasks.
+
+ Read more in sklearn's [documentation][ridgedocs].
+
+ !!! warning
+ Engines `sklearnex` and `cuml` are only available for regression
+ tasks.
+
+ See Also
+ --------
+ atom.models:BayesianRidge
+ atom.models:ElasticNet
+ atom.models:Lasso
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMRegressor
+ from sklearn.datasets import fetch_california_housing
+
+ X, y = fetch_california_housing(return_X_y=True)
+
+ atom = ATOMRegressor(X, y, random_state=1)
+ atom.run(models="Ridge", metric="r2", verbose=2)
+ ```
+
+ """
+
+ acronym = "Ridge"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = True
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn", "sklearnex", "cuml"]
+
+ _module = "linear_model"
+ _estimators = CustomDict({"class": "RidgeClassifier", "reg": "Ridge"})
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ dist = CustomDict(
+ alpha=Float(1e-3, 10, log=True),
+ solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
+ )
+
+ if self.goal == "reg":
+ if self.engine.get("estimator") == "sklearnex":
+ dist.pop("solver") # Only supports 'auto'
+ elif self.engine.get("estimator") == "cuml":
+ dist["solver"] = Cat(["eig", "svd", "cd"])
+
+ return dist
+
+
+class StochasticGradientDescent(ClassRegModel):
+ """Stochastic Gradient Descent.
+
+ Stochastic Gradient Descent is a simple yet very efficient approach
+ to fitting linear classifiers and regressors under convex loss
+ functions. Even though SGD has been around in the machine learning
+ community for a long time, it has received a considerable amount of
+ attention just recently in the context of large-scale learning.
+
+ Corresponding estimators are:
+
+ - [SGDClassifier][] for classification tasks.
+ - [SGDRegressor][] for regression tasks.
+
+ Read more in sklearn's [documentation][sgddocs].
+
+ See Also
+ --------
+ atom.models:MultiLayerPerceptron
+ atom.models:PassiveAggressive
+ atom.models:SupportVectorMachine
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="SGD", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "SGD"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = "max_iter"
+ supports_engines = ["sklearn"]
+
+ _module = "linear_model"
+ _estimators = CustomDict({"class": "SGDClassifier", "reg": "SGDRegressor"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ if self._get_param("penalty", params) != "elasticnet":
+ params.pop("l1_ratio")
+
+ if self._get_param("learning_rate", params) == "optimal":
+ params.pop("eta0")
+
+ return params
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ loss = [
+ "hinge",
+ "log_loss",
+ "modified_huber",
+ "squared_hinge",
+ "perceptron",
+ "squared_error",
+ "huber",
+ "epsilon_insensitive",
+ "squared_epsilon_insensitive",
+ ]
+
+ return CustomDict(
+ loss=Cat(loss if self.goal == "class" else loss[-4:]),
+ penalty=Cat([None, "l1", "l2", "elasticnet"]),
+ alpha=Float(1e-4, 1.0, log=True),
+ l1_ratio=Float(0.1, 0.9, step=0.1),
+ max_iter=Int(500, 1500, step=50),
+ epsilon=Float(1e-4, 1.0, log=True),
+ learning_rate=Cat(["constant", "invscaling", "optimal", "adaptive"]),
+ eta0=Float(1e-2, 10, log=True),
+ power_t=Float(0.1, 0.9, step=0.1),
+ average=Cat([True, False]),
+ )
+
+
+class SupportVectorMachine(ClassRegModel):
+ """Support Vector Machine.
+
+ The implementation of the Support Vector Machine is based on libsvm.
+ The fit time scales at least quadratically with the number of
+ samples and may be impractical beyond tens of thousands of samples.
+ For large datasets consider using a [LinearSVM][] or a
+ [StochasticGradientDescent][] model instead.
+
+ Corresponding estimators are:
+
+ - [SVC][] for classification tasks.
+    - [SVR][] for regression tasks.
+
+ Read more in sklearn's [documentation][svmdocs].
+
+ See Also
+ --------
+ atom.models:LinearSVM
+ atom.models:MultiLayerPerceptron
+ atom.models:StochasticGradientDescent
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="SVM", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "SVM"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = None
+ supports_engines = ["sklearn", "sklearnex", "cuml"]
+
+ _module = "svm"
+ _estimators = CustomDict({"class": "SVC", "reg": "SVR"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ if self.goal == "class":
+ params.pop("epsilon")
+
+ kernel = self._get_param("kernel", params)
+ if kernel == "poly":
+ params.replace_value("gamma", "scale") # Crashes in combination with "auto"
+ else:
+ params.pop("degree")
+
+ if kernel not in ("rbf", "poly", "sigmoid"):
+ params.pop("gamma")
+
+ if kernel not in ("poly", "sigmoid"):
+ params.pop("coef0")
+
+ return params
+
+ def _get_est(self, **params) -> PREDICTOR:
+ """Get the model's estimator with unpacked parameters.
+
+ Returns
+ -------
+ Predictor
+ Estimator instance.
+
+ """
+ if self.engine.get("estimator") == "cuml" and self.goal == "class":
+ return self._est_class(
+ probability=params.pop("probability", True),
+ random_state=params.pop("random_state", self.random_state),
+ **params)
+ else:
+ return super()._get_est(**params)
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ dist = CustomDict(
+ C=Float(1e-3, 100, log=True),
+ kernel=Cat(["linear", "poly", "rbf", "sigmoid"]),
+ degree=Int(2, 5),
+ gamma=Cat(["scale", "auto"]),
+ coef0=Float(-1.0, 1.0),
+ epsilon=Float(1e-3, 100, log=True),
+ shrinking=Cat([True, False]),
+ )
+
+ if self.engine.get("estimator") == "cuml":
+ dist.pop("epsilon")
+ dist.pop("shrinking")
+
+ return dist
+
+
+class XGBoost(ClassRegModel):
+ """Extreme Gradient Boosting.
+
+ XGBoost is an optimized distributed gradient boosting model
+ designed to be highly efficient, flexible and portable. XGBoost
+ provides a parallel tree boosting that solve many data science
+ problems in a fast and accurate way.
+
+ Corresponding estimators are:
+
+ - [XGBClassifier][] for classification tasks.
+ - [XGBRegressor][] for regression tasks.
+
+ Read more in XGBoost's [documentation][xgbdocs].
+
+ See Also
+ --------
+ atom.models:CatBoost
+ atom.models:GradientBoostingMachine
+ atom.models:LightGBM
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(models="XGB", metric="f1", verbose=2)
+ ```
+
+ """
+
+ acronym = "XGB"
+ needs_scaling = True
+ accepts_sparse = True
+ native_multilabel = False
+ native_multioutput = False
+ has_validation = "n_estimators"
+ supports_engines = ["xgboost"]
+
+ _module = "xgboost"
+ _estimators = CustomDict({"class": "XGBClassifier", "reg": "XGBRegressor"})
+
+ def _get_est(self, **params) -> PREDICTOR:
+ """Get the model's estimator with unpacked parameters.
+
+ Returns
+ -------
+ Predictor
+ Estimator instance.
+
+ """
+ eval_metric = None
+ if getattr(self, "_metric", None):
+ eval_metric = XGBMetric(self._metric[0], task=self.task)
+
+ return self._est_class(
+ eval_metric=params.pop("eval_metric", eval_metric),
+ n_jobs=params.pop("n_jobs", self.n_jobs),
+ tree_method=params.pop("tree_method", "gpu_hist" if self._gpu else None),
+ gpu_id=self._device_id,
+ verbosity=params.pop("verbosity", 0),
+ random_state=params.pop("random_state", self.random_state),
+ **params,
+ )
+
+ def _fit_estimator(
+ self,
+ estimator: PREDICTOR,
+ data: tuple[DATAFRAME, PANDAS],
+ est_params_fit: dict,
+ validation: tuple[DATAFRAME, PANDAS] | None = None,
+ trial: Trial | None = None,
+ ):
+ """Fit the estimator and perform in-training validation.
+
+ Parameters
+ ----------
+ estimator: Predictor
+ Instance to fit.
+
+ data: tuple
+ Training data of the form (X, y).
+
+ est_params_fit: dict
+ Additional parameters for the estimator's fit method.
+
+ validation: tuple or None
+ Validation data of the form (X, y). If None, no validation
+ is performed.
+
+ trial: [Trial][] or None
+ Active trial (during hyperparameter tuning).
+
+ Returns
+ -------
+ Predictor
+ Fitted instance.
+
+ """
+ m = self._metric[0].name
+ params = est_params_fit.copy()
+
+ callbacks = params.pop("callbacks", [])
+ if trial and len(self._metric) == 1:
+ callbacks.append(XGBoostPruningCallback(trial, f"validation_1-{m}"))
+
+ try:
+ estimator.set_params(callbacks=callbacks)
+ estimator.fit(
+ *data,
+ eval_set=[data, validation] if validation else None,
+ verbose=params.get("verbose", False),
+ **params,
+ )
+ except TrialPruned as ex:
+ # Add the pruned step to the output
+ step = str(ex).split(" ")[-1][:-1]
+ steps = estimator.get_params()[self.has_validation]
+ trial.params[self.has_validation] = f"{step}/{steps}"
+
+ trial.set_user_attr("estimator", estimator)
+ raise ex
+
+ if validation:
+ # Create evals attribute with train and validation scores
+ # Negative because minimizes the function
+ results = estimator.evals_result()
+ self._evals[f"{m}_train"] = np.negative(results["validation_0"][m])
+ self._evals[f"{m}_test"] = np.negative(results["validation_1"][m])
+
+ return estimator
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(
+ n_estimators=Int(20, 500, step=10),
+ learning_rate=Float(0.01, 1.0, log=True),
+ max_depth=Int(1, 20),
+ gamma=Float(0, 1.0),
+ min_child_weight=Int(1, 10),
+ subsample=Float(0.5, 1.0, step=0.1),
+ colsample_bytree=Float(0.4, 1.0, step=0.1),
+ reg_alpha=Float(1e-4, 100, log=True),
+ reg_lambda=Float(1e-4, 100, log=True),
+ )
diff --git a/atom/models/ensembles.py b/atom/models/ensembles.py
new file mode 100644
index 000000000..39d890983
--- /dev/null
+++ b/atom/models/ensembles.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing all ensemble models.
+
+"""
+
+from __future__ import annotations
+
+from atom.basemodel import ClassRegModel
+from atom.pipeline import Pipeline
+from atom.utils.types import PREDICTOR
+from atom.utils.utils import ClassMap, CustomDict, sign
+
+
+class Stacking(ClassRegModel):
+ """Stacking ensemble.
+
+ Parameters
+ ----------
+ models: ClassMap
+ Models from which to build the ensemble.
+
+ **kwargs
+ Additional keyword arguments for the estimator.
+
+ """
+
+ acronym = "Stack"
+ needs_scaling = False
+ has_validation = None
+ native_multilabel = False
+ native_multioutput = False
+ supports_engines = []
+
+ _module = "atom.ensembles"
+ _estimators = CustomDict({"class": "StackingClassifier", "reg": "StackingRegressor"})
+
+ def __init__(self, models: ClassMap, **kwargs):
+ self._models = models
+ kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)}
+ super().__init__(**kw_model)
+ self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model}
+
+ def _get_est(self, **params) -> PREDICTOR:
+ """Get the model's estimator with unpacked parameters.
+
+ Returns
+ -------
+ Predictor
+ Estimator instance.
+
+ """
+ estimators = []
+ for m in self._models:
+ if m.scaler:
+ name = f"pipeline_{m.name}"
+ est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)])
+ else:
+ name = m.name
+ est = m.estimator
+
+ estimators.append((name, est))
+
+ return self._est_class(
+ estimators=estimators,
+ n_jobs=params.pop("n_jobs", self.n_jobs),
+ **params,
+ )
+
+
+class Voting(ClassRegModel):
+ """Voting ensemble.
+
+ Parameters
+ ----------
+ models: ClassMap
+ Models from which to build the ensemble.
+
+ **kwargs
+ Additional keyword arguments for the estimator.
+
+ """
+
+ acronym = "Vote"
+ needs_scaling = False
+ has_validation = None
+ native_multilabel = False
+ native_multioutput = False
+ supports_engines = []
+
+ _module = "atom.ensembles"
+ _estimators = CustomDict({"class": "VotingClassifier", "reg": "VotingRegressor"})
+
+ def __init__(self, models: ClassMap, **kwargs):
+ self._models = models
+ kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)}
+ super().__init__(**kw_model)
+ self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model}
+
+ if self._est_params.get("voting") == "soft":
+ for m in self._models:
+ if not hasattr(m.estimator, "predict_proba"):
+ raise ValueError(
+ "Invalid value for the voting parameter. If "
+ "'soft', all models in the ensemble should have "
+ f"a predict_proba method, got {m._fullname}."
+ )
+
+ def _get_est(self, **params) -> PREDICTOR:
+ """Get the model's estimator with unpacked parameters.
+
+ Returns
+ -------
+ Predictor
+ Estimator instance.
+
+ """
+ estimators = []
+ for m in self._models:
+ if m.scaler:
+ name = f"pipeline_{m.name}"
+ est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)])
+ else:
+ name = m.name
+ est = m.estimator
+
+ estimators.append((name, est))
+
+ return self._est_class(
+ estimators=estimators,
+ n_jobs=params.pop("n_jobs", self.n_jobs),
+ **params,
+ )
diff --git a/atom/models/ts.py b/atom/models/ts.py
new file mode 100644
index 000000000..b3680a95a
--- /dev/null
+++ b/atom/models/ts.py
@@ -0,0 +1,535 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing all time series models.
+
+"""
+
+from __future__ import annotations
+
+from optuna.distributions import CategoricalDistribution as Cat
+from optuna.distributions import IntDistribution as Int
+from optuna.trial import Trial
+
+from atom.basemodel import ForecastModel
+from atom.utils.utils import CustomDict
+
+
+class ARIMA(ForecastModel):
+ """Autoregressive Integrated Moving Average Model.
+
+    Seasonal ARIMA models and exogenous input are supported, hence this
+ estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX.
+
+    An ARIMA model is a generalization of an autoregressive moving
+ average (ARMA) model, and is fitted to time-series data in an effort
+ to forecast future points. ARIMA models can be especially
+ efficacious in cases where data shows evidence of non-stationarity.
+
+ The "AR" part of ARIMA indicates that the evolving variable of
+ interest is regressed on its own lagged (i.e., prior observed)
+ values. The "MA" part indicates that the regression error is
+ actually a linear combination of error terms whose values occurred
+ contemporaneously and at various times in the past. The "I" (for
+ "integrated") indicates that the data values have been replaced with
+ the difference between their values and the previous values (and this
+ differencing process may have been performed more than once).
+
+ Corresponding estimators are:
+
+ - [ARIMA][arimaclass] for forecasting tasks.
+
+ !!! warning
+ ARIMA often runs into numerical errors when optimizing the
+ hyperparameters. Possible solutions are:
+
+ - Use the [AutoARIMA][] model instead.
+ - Use [`est_params`][directforecaster-est_params] to specify the
+ orders manually, e.g. `#!python atom.run("arima", n_trials=5,
+ est_params={"order": (1, 1, 0)})`.
+ - Use the `catch` parameter in [`ht_params`][directforecaster-ht_params]
+ to avoid raising every exception, e.g. `#!python atom.run("arima",
+ n_trials=5, ht_params={"catch": (Exception,)})`.
+
+ See Also
+ --------
+ atom.models:AutoARIMA
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMForecaster
+ from sktime.datasets import load_longley
+
+ _, X = load_longley()
+
+ atom = ATOMForecaster(X)
+ atom.run(models="ARIMA", verbose=2)
+ ```
+
+ """
+
+ acronym = "ARIMA"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sktime"]
+
+ _module = "sktime.forecasting.arima"
+ _estimators = CustomDict({"fc": "ARIMA"})
+
+ _order = ("p", "d", "q")
+ _sorder = ("Ps", "Ds", "Qs", "S")
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ # If no seasonal periodicity, set seasonal components to zero
+ if self._get_param("S", params) == 0:
+ for p in self._sorder:
+ params.replace_value(p, 0)
+
+ return params
+
+ def _trial_to_est(self, params: CustomDict) -> CustomDict:
+ """Convert trial's hyperparameters to parameters for the estimator.
+
+ Parameters
+ ----------
+ params: CustomDict
+ Trial's hyperparameters.
+
+ Returns
+ -------
+ CustomDict
+ Estimator's hyperparameters.
+
+ """
+ params = super()._trial_to_est(params)
+
+ # Convert params to hyperparameters order and seasonal_order
+ if all(p in params for p in self._sorder):
+ params.insert(0, "seasonal_order", tuple(params.pop(p) for p in self._sorder))
+ if all(p in params for p in self._order):
+ params.insert(0, "order", tuple(params.pop(p) for p in self._order))
+
+ return params
+
+ def _get_distributions(self) -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"]
+
+ dist = CustomDict(
+ p=Int(0, 2),
+ d=Int(0, 1),
+ q=Int(0, 2),
+ Ps=Int(0, 2),
+ Ds=Int(0, 1),
+ Qs=Int(0, 2),
+ S=Cat([0, 4, 6, 7, 12]),
+ method=Cat(methods),
+ maxiter=Int(50, 200, step=10),
+ with_intercept=Cat([True, False]),
+ )
+
+ # Drop order and seasonal_order params if specified by user
+ if "order" in self._est_params:
+ for p in self._order:
+ dist.pop(p)
+ if "seasonal_order" in self._est_params:
+ for p in self._sorder:
+ dist.pop(p)
+
+ return dist
+
+
+class AutoARIMA(ForecastModel):
+ """Automatic Autoregressive Integrated Moving Average Model.
+
+ [ARIMA][] implementation that includes automated fitting of
+ (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA
+ algorithm seeks to identify the most optimal parameters for an
+ ARIMA model, settling on a single fitted ARIMA model. This process
+ is based on the commonly-used R function.
+
+ AutoARIMA works by conducting differencing tests (i.e.,
+ Kwiatkowski–Phillips–Schmidt–Shin, Augmented Dickey-Fuller or
+ Phillips–Perron) to determine the order of differencing, d, and
+ then fitting models within defined ranges. AutoARIMA also seeks
+ to identify the optimal P and Q hyperparameters after conducting
+    the Canova-Hansen test to determine the optimal order of seasonal
+ differencing.
+
+ Note that due to stationarity issues, AutoARIMA might not find a
+ suitable model that will converge. If this is the case, a ValueError
+ is thrown suggesting stationarity-inducing measures be taken prior
+ to re-fitting or that a new range of order values be selected.
+
+ Corresponding estimators are:
+
+ - [AutoARIMA][autoarimaclass] for forecasting tasks.
+
+ See Also
+ --------
+ atom.models:ARIMA
+ atom.models:ETS
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMForecaster
+ from sktime.datasets import load_longley
+
+ _, X = load_longley()
+
+ atom = ATOMForecaster(X, random_state=1)
+ atom.run(models="autoarima", verbose=2)
+ ```
+
+ """
+
+ acronym = "AutoARIMA"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sktime"]
+
+ _module = "sktime.forecasting.arima"
+ _estimators = CustomDict({"fc": "AutoARIMA"})
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"]
+
+ return CustomDict(
+ method=Cat(methods),
+ maxiter=Int(50, 200, step=10),
+ with_intercept=Cat([True, False]),
+ )
+
+
+class ExponentialSmoothing(ForecastModel):
+ """Exponential Smoothing forecaster.
+
+ Holt-Winters exponential smoothing forecaster. The default settings
+ use simple exponential smoothing, without trend and seasonality
+ components.
+
+ Corresponding estimators are:
+
+ - [ExponentialSmoothing][esclass] for forecasting tasks.
+
+ See Also
+ --------
+ atom.models:ARIMA
+ atom.models:ETS
+ atom.models:PolynomialTrend
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMForecaster
+ from sktime.datasets import load_airline
+
+ y = load_airline()
+
+ atom = ATOMForecaster(y, random_state=1)
+ atom.run(models="ES", verbose=2)
+ ```
+
+ """
+
+ acronym = "ES"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sktime"]
+
+ _module = "sktime.forecasting.exp_smoothing"
+ _estimators = CustomDict({"fc": "ExponentialSmoothing"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+ if self._get_param("trend", params) is None:
+ params.pop("damped_trend")
+
+ if self._get_param("sp", params) is None:
+ params.pop("seasonal")
+
+ return params
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ methods = ["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"]
+
+ return CustomDict(
+ trend=Cat(["add", "mul", None]),
+ damped_trend=Cat([True, False]),
+ seasonal=Cat(["add", "mul", None]),
+ sp=Cat([4, 6, 7, 12, None]),
+ use_boxcox=Cat([True, False]),
+ initialization_method=Cat(["estimated", "heuristic"]),
+ method=Cat(methods),
+ )
+
+
+class ETS(ForecastModel):
+ """ETS model with automatic fitting capabilities.
+
+ The ETS models are a family of time series models with an
+    underlying state space model consisting of a level component (L),
+ a trend component (T), a seasonal component (S), and an error
+ term (E).
+
+ Corresponding estimators are:
+
+ - [AutoETS][] for forecasting tasks.
+
+ See Also
+ --------
+ atom.models:ARIMA
+ atom.models:ExponentialSmoothing
+ atom.models:PolynomialTrend
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMForecaster
+ from sktime.datasets import load_airline
+
+ y = load_airline()
+
+ atom = ATOMForecaster(y, random_state=1)
+ atom.run(models="ETS", verbose=2)
+
+ ```
+
+ """
+
+ acronym = "ETS"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sktime"]
+
+ _module = "sktime.forecasting.ets"
+ _estimators = CustomDict({"fc": "AutoETS"})
+
+ def _get_parameters(self, trial: Trial) -> CustomDict:
+ """Get the trial's hyperparameters.
+
+ Parameters
+ ----------
+ trial: [Trial][]
+ Current trial.
+
+ Returns
+ -------
+ CustomDict
+ Trial's hyperparameters.
+
+ """
+ params = super()._get_parameters(trial)
+
+        # Drop the seasonal component when there is no seasonal periodicity
+ if self._get_param("sp", params) == 1:
+ params.pop("seasonal")
+
+ return params
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(
+ error=Cat(["add", "mul"]),
+ trend=Cat(["add", "mul", None]),
+ damped_trend=Cat([True, False]),
+ seasonal=Cat(["add", "mul", None]),
+ sp=Cat([1, 4, 6, 7, 12]),
+ initialization_method=Cat(["estimated", "heuristic"]),
+ maxiter=Int(500, 2000, step=100),
+ auto=Cat([True, False]),
+ information_criterion=Cat(["aic", "bic", "aicc"]),
+ )
+
+
+class NaiveForecaster(ForecastModel):
+ """Naive Forecaster.
+
+ NaiveForecaster is a dummy forecaster that makes forecasts using
+ simple strategies based on naive assumptions about past trends
+ continuing. When used in [multivariate][] tasks, each column is
+ forecasted with the same strategy.
+
+ Corresponding estimators are:
+
+ - [NaiveForecaster][naiveforecasterclass] for forecasting tasks.
+
+ See Also
+ --------
+ atom.models:ExponentialSmoothing
+ atom.models:Dummy
+ atom.models:PolynomialTrend
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMForecaster
+ from sktime.datasets import load_airline
+
+ y = load_airline()
+
+ atom = ATOMForecaster(y, random_state=1)
+ atom.run(models="NF", verbose=2)
+
+ ```
+
+ """
+
+ acronym = "NF"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sktime"]
+
+ _module = "sktime.forecasting.naive"
+ _estimators = CustomDict({"fc": "NaiveForecaster"})
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(strategy=Cat(["last", "mean", "drift"]))
+
+
+class PolynomialTrend(ForecastModel):
+ """Polynomial Trend forecaster.
+
+ Forecast time series data with a polynomial trend, using a sklearn
+ [LinearRegression][] class to regress values of time series on
+ index, after extraction of polynomial features.
+
+ Corresponding estimators are:
+
+ - [PolynomialTrendForecaster][] for forecasting tasks.
+
+ See Also
+ --------
+ atom.models:ARIMA
+ atom.models:ETS
+ atom.models:NaiveForecaster
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMForecaster
+ from sktime.datasets import load_airline
+
+ y = load_airline()
+
+ atom = ATOMForecaster(y, random_state=1)
+ atom.run(models="PT", verbose=2)
+ ```
+
+ """
+
+ acronym = "PT"
+ needs_scaling = False
+ accepts_sparse = False
+ native_multilabel = False
+ native_multioutput = True
+ has_validation = None
+ supports_engines = ["sktime"]
+
+ _module = "sktime.forecasting.trend"
+ _estimators = CustomDict({"fc": "PolynomialTrendForecaster"})
+
+ @staticmethod
+ def _get_distributions() -> CustomDict:
+ """Get the predefined hyperparameter distributions.
+
+ Returns
+ -------
+ CustomDict
+ Hyperparameter distributions.
+
+ """
+ return CustomDict(
+ degree=Int(1, 5),
+ with_intercept=Cat([True, False]),
+ )
diff --git a/atom/nlp.py b/atom/nlp.py
index ee3f79a07..5d43a9e14 100644
--- a/atom/nlp.py
+++ b/atom/nlp.py
@@ -949,7 +949,7 @@ class Vectorizer(BaseEstimator, TransformerMixin, BaseTransformer):
def __init__(
self,
- strategy: str = "bow",
+ strategy: Literal["bow", "tfidf", "hashing"] = "bow",
*,
return_sparse: BOOL = True,
device: str = "cpu",
@@ -1001,17 +1001,11 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Vectorizer:
hashing="HashingVectorizer",
)
- if self.strategy in strategies:
- estimator = self._get_est_class(
- name=strategies[self.strategy],
- module="feature_extraction.text",
- )
- self._estimator = estimator(**self.kwargs)
- else:
- raise ValueError(
- "Invalid value for the strategy parameter, got "
- f"{self.strategy}. Choose from: {', '.join(strategies)}."
- )
+ estimator = self._get_est_class(
+ name=strategies[self.strategy],
+ module="feature_extraction.text",
+ )
+ self._estimator = estimator(**self.kwargs)
self.log("Fitting Vectorizer...", 1)
self._estimator.fit(X[corpus])
diff --git a/atom/pipeline.py b/atom/pipeline.py
index 9c9fcd2e6..3f68174cb 100644
--- a/atom/pipeline.py
+++ b/atom/pipeline.py
@@ -22,7 +22,8 @@
from typeguard import typechecked
from atom.utils.types import (
- BOOL, DATAFRAME, ESTIMATOR, FEATURES, FLOAT, SEQUENCE, SERIES, TARGET, INT
+ BOOL, DATAFRAME, ESTIMATOR, FEATURES, FLOAT, INT, PANDAS, SEQUENCE, SERIES,
+ TARGET,
)
from atom.utils.utils import (
check_is_fitted, fit_one, fit_transform_one, transform_one,
@@ -261,7 +262,7 @@ def transform(
self,
X: FEATURES | None = None,
y: TARGET | None = None,
- ) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]:
+ ) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]:
"""Transform the data.
Call `transform` on each transformer in the pipeline. The
@@ -304,7 +305,7 @@ def fit_transform(
X: FEATURES | None = None,
y: TARGET | None = None,
**fit_params,
- ) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]:
+ ) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]:
"""Fit the pipeline and transform the data.
Parameters
@@ -314,13 +315,15 @@ def fit_transform(
X is ignored. None
if the estimator only uses y.
- y: int, str, dict, sequence or None, default=None
+ y: int, str, dict, sequence, dataframe or None, default=None
Target column corresponding to X.
- If None: y is ignored.
- If int: Position of the target column in X.
- If str: Name of the target column in X.
- - Else: Array with shape=(n_samples,) to use as target.
+ - If sequence: Target array with shape=(n_samples,) or
+ sequence of column names or positions for multioutput tasks.
+ - If dataframe: Target columns for multioutput tasks.
**fit_params
Additional keyword arguments for the fit method.
@@ -330,7 +333,7 @@ def fit_transform(
dataframe
Transformed feature set. Only returned if provided.
- series
+ series or dataframe
Transformed target column. Only returned if provided.
"""
@@ -352,7 +355,7 @@ def inverse_transform(
self,
X: FEATURES | None = None,
y: TARGET | None = None,
- ) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]:
+ ) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]:
"""Inverse transform for each step in a reverse order.
All estimators in the pipeline must implement the
@@ -364,20 +367,22 @@ def inverse_transform(
Feature set with shape=(n_samples, n_features). If None,
X is ignored. None if the pipeline only uses y.
- y: int, str, dict, sequence or None, default=None
+ y: int, str, dict, sequence, dataframe or None, default=None
Target column corresponding to X.
- If None: y is ignored.
- If int: Position of the target column in X.
- If str: Name of the target column in X.
- - Else: Array with shape=(n_samples,) to use as target.
+ - If sequence: Target array with shape=(n_samples,) or
+ sequence of column names or positions for multioutput tasks.
+ - If dataframe: Target columns for multioutput tasks.
Returns
-------
dataframe
Transformed feature set. Only returned if provided.
- series
+ series or dataframe
Transformed target column. Only returned if provided.
"""
diff --git a/atom/plots.py b/atom/plots.py
deleted file mode 100644
index 5da24fdfb..000000000
--- a/atom/plots.py
+++ /dev/null
@@ -1,8289 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-Automated Tool for Optimized Modelling (ATOM)
-Author: Mavs
-Description: Module containing the plotting classes.
-
-"""
-
-from __future__ import annotations
-
-from collections import defaultdict
-from contextlib import contextmanager
-from dataclasses import dataclass
-from datetime import datetime
-from functools import reduce
-from importlib.util import find_spec
-from itertools import chain, cycle
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
-import shap
-from joblib import Parallel, delayed
-from mlflow.tracking import MlflowClient
-from nltk.collocations import (
- BigramCollocationFinder, QuadgramCollocationFinder,
- TrigramCollocationFinder,
-)
-from optuna.importance import FanovaImportanceEvaluator
-from optuna.trial import TrialState
-from optuna.visualization._parallel_coordinate import (
- _get_dims_from_info, _get_parallel_coordinate_info,
-)
-from optuna.visualization._terminator_improvement import _get_improvement_info
-from optuna.visualization._utils import _is_log_scale
-from plotly.colors import unconvert_from_RGB_255, unlabel_rgb
-from scipy import stats
-from scipy.stats.mstats import mquantiles
-from sklearn.calibration import calibration_curve
-from sklearn.inspection import partial_dependence, permutation_importance
-from sklearn.metrics import (
- confusion_matrix, det_curve, precision_recall_curve, roc_curve,
-)
-from sklearn.utils import _safe_indexing
-from sklearn.utils._bunch import Bunch
-from sklearn.utils.metaestimators import available_if
-from sktime.forecasting.base import ForecastingHorizon
-from typeguard import typechecked
-
-from atom.utils.constants import PALETTE
-from atom.utils.types import (
- BOOL, DATAFRAME, FEATURES, FLOAT, INDEX, INT, INT_TYPES, METRIC_SELECTOR,
- MODEL, SCALAR, SEQUENCE, SERIES, SLICE,
-)
-from atom.utils.utils import (
- bk, check_canvas, check_dependency, check_hyperparams, check_predict_proba,
- composed, crash, divide, get_best_score, get_corpus, get_custom_scorer,
- has_attr, has_task, is_binary, is_multioutput, it, lst, plot_from_model,
- rnd, to_rgb,
-)
-
-
-@dataclass
-class Aesthetics:
- """Keeps track of plot aesthetics."""
-
- palette: SEQUENCE # Sequence of colors
- title_fontsize: INT # Fontsize for titles
- label_fontsize: INT # Fontsize for labels, legend and hoverinfo
- tick_fontsize: INT # Fontsize for ticks
- line_width: INT # Width of the line plots
- marker_size: INT # Size of the markers
-
-
-@typechecked
-class BaseFigure:
- """Base plotly figure.
-
- The instance stores the position of the current axes in grid,
- as well as the models used for the plot (to track in mlflow).
-
- Parameters
- ----------
- rows: int, default=1
- Number of subplot rows in the canvas.
-
- cols: int, default=1
- Number of subplot columns in the canvas.
-
- horizontal_spacing: float, default=0.05
- Space between subplot rows in normalized plot coordinates.
- The spacing is relative to the figure's size.
-
- vertical_spacing: float, default=0.07
- Space between subplot cols in normalized plot coordinates.
- The spacing is relative to the figure's size.
-
- palette: str or sequence, default="Prism"
- Name or color sequence for the palette.
-
- is_canvas: bool, default=False
- Whether the figure shows multiple plots.
-
- backend: str, default="plotly"
- Figure's backend. Choose between plotly or matplotlib.
-
- create_figure: bool, default=True
- Whether to create a new figure.
-
- """
-
- _marker = ["circle", "x", "diamond", "pentagon", "star", "hexagon"]
- _dash = [None, "dashdot", "dash", "dot", "longdash", "longdashdot"]
- _shape = ["", "/", "x", "\\", "-", "|", "+", "."]
-
- def __init__(
- self,
- rows: INT = 1,
- cols: INT = 1,
- horizontal_spacing: FLOAT = 0.05,
- vertical_spacing: FLOAT = 0.07,
- palette: str | SEQUENCE = "Prism",
- is_canvas: BOOL = False,
- backend: str = "plotly",
- create_figure: BOOL = True,
- ):
- self.rows = rows
- self.cols = cols
- self.horizontal_spacing = horizontal_spacing
- self.vertical_spacing = vertical_spacing
- if isinstance(palette, str):
- self._palette = getattr(px.colors.qualitative, palette)
- self.palette = cycle(self._palette)
- else:
- # Convert color names or hex to rgb
- self._palette = list(map(to_rgb, palette))
- self.palette = cycle(self._palette)
- self.is_canvas = is_canvas
- self.backend = backend
- self.create_figure = create_figure
-
- self.idx = 0 # N-th plot in the canvas
- self.axes = 0 # N-th axis in the canvas
- if self.create_figure:
- if self.backend == "plotly":
- self.figure = go.Figure()
- else:
- self.figure, _ = plt.subplots()
-
- self.groups = []
- self.style = dict(palette={}, marker={}, dash={}, shape={})
- self.marker = cycle(self._marker)
- self.dash = cycle(self._dash)
- self.shape = cycle(self._shape)
-
- self.pos = {} # Subplot position to use for title
- self.custom_layout = {} # Layout params specified by user
- self.used_models = [] # Models plotted in this figure
-
- # Perform parameter checks
- if not 0 < horizontal_spacing < 1:
- raise ValueError(
- "Invalid value for the horizontal_spacing parameter. The "
- f"value must lie between 0 and 1, got {horizontal_spacing}."
- )
-
- if not 0 < vertical_spacing < 1:
- raise ValueError(
- "Invalid value for the vertical_spacing parameter. The "
- f"value must lie between 0 and 1, got {vertical_spacing}."
- )
-
- @property
- def grid(self) -> tuple[INT, INT]:
- """Position of the current axes on the grid.
-
- Returns
- -------
- int
- X-position.
-
- int
- Y-position.
-
- """
- return (self.idx - 1) // self.cols + 1, self.idx % self.cols or self.cols
-
- @property
- def next_subplot(self) -> go.Figure | plt.Figure | None:
- """Increase the subplot index.
-
- Returns
- -------
- go.Figure, plt.Figure or None
- Current figure. Returns None if `create_figure=False`.
-
- """
- # Check if there are too many plots in the canvas
- if self.idx >= self.rows * self.cols:
- raise ValueError(
- "Invalid number of plots in the canvas! Increase "
- "the number of rows and cols to add more plots."
- )
- else:
- self.idx += 1
-
- if self.create_figure:
- return self.figure
-
- def get_elem(self, name: SCALAR | str | None = None, element: str = "palette") -> str:
- """Get the plot element for a specific name.
-
- This method is used to assign the same element (color, marker,
- etc...) to the same columns and models in a plot.
-
- Parameters
- ----------
- name: int, float or str or None
- Name for which to get the plot element. The name is stored in
- the element attributes to assign the same element to all calls
- with the same name.
-
- element: str, default="palette"
- Plot element to get. Choose from: palette, marker, dash, shape.
-
- Returns
- -------
- str
- Element code.
-
- """
- if name is None:
- return getattr(self, f"_{element}")[0] # Get first element (default)
- elif name in self.style[element]:
- return self.style[element][name]
- else:
- return self.style[element].setdefault(name, next(getattr(self, element)))
-
- def showlegend(self, name: str, legend: str | dict | None) -> BOOL:
- """Get whether the trace should be showed in the legend.
-
- If there's already a trace with the same name, it's not
- necessary to show it in the plot's legend.
-
- Parameters
- ----------
- name: str
- Name of the trace.
-
- legend: str, dict or None
- Legend parameter.
-
- Returns
- -------
- bool
- Whether the trace should be placed in the legend.
-
- """
- if name in self.groups:
- return False
- else:
- self.groups.append(name)
- return legend is not None
-
- def get_axes(
- self,
- x: tuple[INT, INT] = (0, 1),
- y: tuple[INT, INT] = (0, 1),
- coloraxis: dict | None = None,
- ) -> tuple[str, str]:
- """Create and update the plot's axes.
-
- Parameters
- ----------
- x: tuple of int
- Relative x-size of the plot.
-
- y: tuple of int
- Relative y-size of the plot.
-
- coloraxis: dict or None
- Properties of the coloraxis to create. None to ignore.
-
- Returns
- -------
- str
- Name of the x-axis.
-
- str
- Name of the y-axis.
-
- """
- self.axes += 1
-
- # Calculate the distance between subplots
- x_offset = divide(self.horizontal_spacing, (self.cols - 1))
- y_offset = divide(self.vertical_spacing, (self.rows - 1))
-
- # Calculate the size of the subplot
- x_size = (1 - ((x_offset * 2) * (self.cols - 1))) / self.cols
- y_size = (1 - ((y_offset * 2) * (self.rows - 1))) / self.rows
-
- # Calculate the size of the axes
- ax_size = (x[1] - x[0]) * x_size
- ay_size = (y[1] - y[0]) * y_size
-
- # Determine the position for the axes
- x_pos = (self.grid[1] - 1) * (x_size + 2 * x_offset) + x[0] * x_size
- y_pos = (self.rows - self.grid[0]) * (y_size + 2 * y_offset) + y[0] * y_size
-
- # Store positions for subplot title
- self.pos[str(self.axes)] = (x_pos + ax_size / 2, rnd(y_pos + ay_size))
-
- # Update the figure with the new axes
- self.figure.update_layout(
- {
- f"xaxis{self.axes}": dict(
- domain=(x_pos, rnd(x_pos + ax_size)), anchor=f"y{self.axes}"
- ),
- f"yaxis{self.axes}": dict(
- domain=(y_pos, rnd(y_pos + ay_size)), anchor=f"x{self.axes}"
- ),
- }
- )
-
- # Place a colorbar right of the axes
- if coloraxis:
- if title := coloraxis.pop("title", None):
- coloraxis["colorbar_title"] = dict(
- text=title, side="right", font_size=coloraxis.pop("font_size")
- )
-
- coloraxis["colorbar_x"] = rnd(x_pos + ax_size) + ax_size / 40
- coloraxis["colorbar_xanchor"] = "left"
- coloraxis["colorbar_y"] = y_pos + ay_size / 2
- coloraxis["colorbar_yanchor"] = "middle"
- coloraxis["colorbar_len"] = ay_size * 0.9
- coloraxis["colorbar_thickness"] = ax_size * 30 # Default width in pixels
- self.figure.update_layout(
- {f"coloraxis{coloraxis.pop('axes', self.axes)}": coloraxis}
- )
-
- xaxis = f"x{self.axes if self.axes > 1 else ''}"
- yaxis = f"y{self.axes if self.axes > 1 else ''}"
- return xaxis, yaxis
-
-
-@typechecked
-class BasePlot:
- """Base class for all plotting methods.
-
- This base class defines the properties that can be changed
- to customize the plot's aesthetics.
-
- """
-
- _fig = None
- _custom_layout = {}
- _custom_traces = {}
- _aesthetics = Aesthetics(
- palette=list(PALETTE),
- title_fontsize=24,
- label_fontsize=16,
- tick_fontsize=12,
- line_width=2,
- marker_size=8,
- )
-
- # Properties =================================================== >>
-
- @property
- def aesthetics(self) -> dict:
- """All plot aesthetic attributes."""
- return self._aesthetics
-
- @aesthetics.setter
- def aesthetics(self, value: dict):
- self.palette = value.get("palette", self.palette)
- self.title_fontsize = value.get("title_fontsize", self.title_fontsize)
- self.label_fontsize = value.get("label_fontsize", self.label_fontsize)
- self.tick_fontsize = value.get("tick_fontsize", self.tick_fontsize)
- self.line_width = value.get("line_width", self.line_width)
- self.marker_size = value.get("marker_size", self.marker_size)
-
- @property
- def palette(self) -> str | SEQUENCE:
- """Color palette.
-
- Specify one of plotly's [built-in palettes][palette] or create
- a custom one, e.g. `atom.palette = ["red", "green", "blue"]`.
-
- """
- return self._aesthetics.palette
-
- @palette.setter
- def palette(self, value: str | SEQUENCE):
- if isinstance(value, str) and not hasattr(px.colors.qualitative, value):
- raise ValueError(
- f"Invalid value for the palette parameter, got {value}. Choose "
- f"from one of plotly's built-in qualitative color sequences in "
- f"the px.colors.qualitative module or define your own sequence."
- )
-
- self._aesthetics.palette = value
-
- @property
- def title_fontsize(self) -> INT:
- """Fontsize for the plot's title."""
- return self._aesthetics.title_fontsize
-
- @title_fontsize.setter
- def title_fontsize(self, value: INT):
- if value <= 0:
- raise ValueError(
- "Invalid value for the title_fontsize parameter. "
- f"Value should be >=0, got {value}."
- )
-
- self._aesthetics.title_fontsize = value
-
- @property
- def label_fontsize(self) -> INT:
- """Fontsize for the labels, legend and hover information."""
- return self._aesthetics.label_fontsize
-
- @label_fontsize.setter
- def label_fontsize(self, value: INT):
- if value <= 0:
- raise ValueError(
- "Invalid value for the label_fontsize parameter. "
- f"Value should be >=0, got {value}."
- )
-
- self._aesthetics.label_fontsize = value
-
- @property
- def tick_fontsize(self) -> INT:
- """Fontsize for the ticks along the plot's axes."""
- return self._aesthetics.tick_fontsize
-
- @tick_fontsize.setter
- def tick_fontsize(self, value: INT):
- if value <= 0:
- raise ValueError(
- "Invalid value for the tick_fontsize parameter. "
- f"Value should be >=0, got {value}."
- )
-
- self._aesthetics.tick_fontsize = value
-
- @property
- def line_width(self) -> INT:
- """Width of the line plots."""
- return self._aesthetics.line_width
-
- @line_width.setter
- def line_width(self, value: INT):
- if value <= 0:
- raise ValueError(
- "Invalid value for the line_width parameter. "
- f"Value should be >=0, got {value}."
- )
-
- self._aesthetics.line_width = value
-
- @property
- def marker_size(self) -> INT:
- """Size of the markers."""
- return self._aesthetics.marker_size
-
- @marker_size.setter
- def marker_size(self, value: INT):
- if value <= 0:
- raise ValueError(
- "Invalid value for the marker_size parameter. "
- f"Value should be >=0, got {value}."
- )
-
- self._aesthetics.marker_size = value
-
- # Methods ====================================================== >>
-
- @staticmethod
- def _get_plot_index(df: DATAFRAME) -> INDEX:
- """Return the dataset's index in a plottable format.
-
- Plotly does not accept all index formats (e.g. pd.Period),
- thus use this utility method to convert to timestamp those
- indices that can, else return as is.
-
- Parameters
- ----------
- df: dataframe
- Data set to get the index from.
-
- Returns
- -------
- index
- Index in an acceptable format.
-
- """
- if hasattr(df.index, "to_timestamp"):
- return df.index.to_timestamp()
- else:
- return df.index
-
- @staticmethod
- def _get_show(show: INT | None, model: MODEL | list[MODEL]) -> INT:
- """Check and return the number of features to show.
-
- Parameters
- ----------
- show: int or None
- Number of features to show. If None, select all (max 200).
-
- model: Model or list
- Models from which to get the features.
-
- Returns
- -------
- int
- Number of features to show.
-
- """
- max_fxs = max(m.n_features for m in lst(model))
- if show is None or show > max_fxs:
- # Limit max features shown to avoid maximum figsize error
- show = min(200, max_fxs)
- elif show < 1:
- raise ValueError(
- f"Invalid value for the show parameter. Value should be >0, got {show}."
- )
-
- return show
-
- @staticmethod
- def _get_hyperparams(
- params: str | slice | SEQUENCE | None,
- model: MODEL,
- ) -> list[str]:
- """Check and return a model's hyperparameters.
-
- Parameters
- ----------
- params: str, slice, sequence or None
- Hyperparameters to get. Use a sequence or add `+` between
- options to select more than one. If None, all the model's
- hyperparameters are selected.
-
- model: Model
- Get the params from this model.
-
- Returns
- -------
- list of str
- Selected hyperparameters.
-
- """
- if params is None:
- hyperparameters = list(model._ht["distributions"])
- elif isinstance(params, slice):
- hyperparameters = list(model._ht["distributions"])[params]
- else:
- hyperparameters = []
- for param in lst(params):
- if isinstance(param, INT_TYPES):
- hyperparameters.append(list(model._ht["distributions"])[param])
- elif isinstance(param, str):
- for p in param.split("+"):
- if p not in model._ht["distributions"]:
- raise ValueError(
- "Invalid value for the params parameter. "
- f"Hyperparameter {p} was not used during the "
- f"optimization of model {model.name}."
- )
- else:
- hyperparameters.append(p)
-
- if not hyperparameters:
- raise ValueError(f"Didn't find any hyperparameters for model {model.name}.")
-
- return hyperparameters
-
- def _get_metric(
- self,
- metric: INT | str | SEQUENCE,
- max_one: BOOL,
- ) -> INT | str | list[INT]:
- """Check and return the provided metric index.
-
- Parameters
- ----------
- metric: int, str, sequence or None
- Metric to retrieve. If None, all metrics are returned.
-
- max_one: bool
- Whether one or multiple metrics are allowed.
-
- Returns
- -------
- int or list
- Position index of the metric. If `max_one=False`, returns
- a list of metric positions.
-
- """
- if metric is None:
- return list(range(len(self._metric)))
- else:
- inc = []
- for met in lst(metric):
- if isinstance(met, INT_TYPES):
- if 0 <= met < len(self._metric):
- inc.append(met)
- else:
- raise ValueError(
- f"Invalid value for the metric parameter. Value {met} is out "
- f"of range for a pipeline with {len(self._metric)} metrics."
- )
- elif isinstance(met, str):
- met = met.lower()
- for m in met.split("+"):
- if m in ("time_ht", "time_fit", "time_bootstrap", "time"):
- inc.append(m)
- elif (name := get_custom_scorer(m).name) in self.metric:
- inc.append(self._metric.index(name))
- else:
- raise ValueError(
- "Invalid value for the metric parameter. The "
- f"{name} metric wasn't used to fit the models."
- )
-
- if len(inc) > 1 and max_one:
- raise ValueError(
- "Invalid value for the metric parameter. "
- f"Only one metric is allowed, got {inc}."
- )
-
- return inc[0] if max_one else inc
-
- def _get_set(
- self,
- dataset: str | SEQUENCE,
- max_one: BOOL,
- allow_holdout: BOOL = True,
- ) -> str | list[str]:
- """Check and return the provided data set.
-
- Parameters
- ----------
- dataset: str or sequence
- Name(s) of the data set to retrieve.
-
- max_one: bool
- Whether one or multiple data sets are allowed. If True, return
- the data set instead of a list.
-
- allow_holdout: bool, default=True
- Whether to allow the retrieval of the holdout set.
-
- Returns
- -------
- str or list
- Selected data set(s).
-
- """
- for ds in (dataset := "+".join(lst(dataset)).lower().split("+")):
- if ds == "holdout":
- if allow_holdout:
- if self.holdout is None:
- raise ValueError(
- "Invalid value for the dataset parameter. No holdout "
- "data set was specified when initializing the instance."
- )
- else:
- raise ValueError(
- "Invalid value for the dataset parameter, got "
- f"{ds}. Choose from: train, test."
- )
- elif ds not in ("train", "test"):
- raise ValueError(
- "Invalid value for the dataset parameter, got {ds}. "
- f"Choose from: train, test{', holdout' if allow_holdout else ''}."
- )
-
- if max_one and len(dataset) > 1:
- raise ValueError(
- "Invalid value for the dataset parameter, got "
- f"{dataset}. Only one data set is allowed."
- )
-
- return dataset[0] if max_one else dataset
-
- def _get_figure(self, **kwargs) -> go.Figure | plt.Figure:
- """Return existing figure if in canvas, else a new figure.
-
- Every time this method is called from a canvas, the plot
- index is raised by one to keep track in which subplot the
- BaseFigure is at.
-
- Parameters
- ----------
- **kwargs
- Additional keyword arguments for BaseFigure.
-
- Returns
- -------
- [go.Figure][] or [plt.Figure][]
- Existing figure or newly created.
-
- """
- if BasePlot._fig and BasePlot._fig.is_canvas:
- return BasePlot._fig.next_subplot
- else:
- BasePlot._fig = BaseFigure(palette=self.palette, **kwargs)
- return BasePlot._fig.next_subplot
-
- def _draw_line(
- self,
- parent: str,
- child: str | None = None,
- legend: str | dict = None,
- **kwargs,
- ) -> go.Scatter:
- """Draw a line.
-
- Unify the style to draw a line, where parent and child
- (e.g. model - data set or column - distribution) keep the
- same style (color or dash). A legendgroup title is only added
- when there is a child element.
-
- Parameters
- ----------
- parent: str
- Name of the model.
-
- child: str or None, default=None
- Data set which is plotted.
-
- legend: str, dict or None
- Legend argument provided by the user.
-
- **kwargs
- Additional keyword arguments for the trace.
-
- Returns
- -------
- go.Scatter
- New trace to add to figure.
-
- """
- legendgrouptitle = dict(text=parent, font_size=self.label_fontsize)
- hover = f"(%{{x}}, %{{y}}){parent}{f' - {child}' if child else ''}"
- return go.Scatter(
- line=dict(
- width=self.line_width,
- color=BasePlot._fig.get_elem(parent),
- dash=BasePlot._fig.get_elem(child, "dash"),
- ),
- marker=dict(
- symbol=BasePlot._fig.get_elem(child, "marker"),
- size=self.marker_size,
- color=BasePlot._fig.get_elem(parent),
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
- hovertemplate=kwargs.pop("hovertemplate", hover),
- name=kwargs.pop("name", child or parent),
- legendgroup=kwargs.pop("legendgroup", parent),
- legendgrouptitle=legendgrouptitle if child else None,
- showlegend=BasePlot._fig.showlegend(f"{parent}-{child}", legend),
- **kwargs,
- )
-
- @staticmethod
- def _draw_straight_line(y: SCALAR | str, xaxis: str, yaxis: str):
- """Draw a line across the axis.
-
- The line can be either horizontal or diagonal. The line should
- be used as reference. It's not added to the legend and doesn't
- show any information on hover.
-
- Parameters
- ----------
- y: int, float or str, default="diagonal"
- Coordinates on the y-axis. If a value, draw a horizontal line
- at that value. If "diagonal", draw a diagonal line from x.
-
- xaxis: str
- Name of the x-axis to draw in.
-
- yaxis: str
- Name of the y-axis to draw in.
-
- """
- BasePlot._fig.figure.add_shape(
- type="line",
- x0=0,
- x1=1,
- y0=0 if y == "diagonal" else y,
- y1=1 if y == "diagonal" else y,
- xref=f"{xaxis} domain",
- yref=f"{yaxis} domain" if y == "diagonal" else yaxis,
- line=dict(width=1, color="black", dash="dash"),
- opacity=0.6,
- layer="below",
- )
-
- def _plot(
- self,
- fig: go.Figure | plt.Figure | None = None,
- ax: plt.Axes | tuple[str, str] | None = None,
- **kwargs,
- ) -> go.Figure | plt.Figure | None:
- """Make the plot.
-
- Customize the axes to the default layout and plot the figure
- if it's not part of a canvas.
-
- Parameters
- ----------
- fig: go.Figure, plt.Figure or None
- Current figure. If None, use `plt.gcf()`.
-
- ax: plt.Axes, tuple or None, default=None
- Axis object or names of the axes to update. If None, ignore
- their update.
-
- **kwargs
- Keyword arguments containing the figure's parameters.
-
- - title: Name of the title or custom configuration.
- - legend: Whether to show the legend or custom configuration.
- - xlabel: Label for the x-axis.
- - ylabel: Label for the y-axis.
- - xlim: Limits for the x-axis.
- - ylim: Limits for the y-axis.
- - figsize: Size of the figure.
- - filename: Name of the saved file.
- - plotname: Name of the plot.
- - display: Whether to show the plot. If None, return the figure.
-
- Returns
- -------
- plt.Figure, go.Figure or None
- Created figure. Only returned if `display=None`.
-
- """
- # Set name with which to save the file
- if kwargs.get("filename"):
- if kwargs["filename"].endswith("auto"):
- name = kwargs["filename"].replace("auto", kwargs["plotname"])
- else:
- name = kwargs["filename"]
- else:
- name = kwargs.get("plotname")
-
- fig = fig or BasePlot._fig.figure
- if BasePlot._fig.backend == "plotly":
- if ax:
- fig.update_layout(
- {
- f"{ax[0]}_title": dict(
- text=kwargs.get("xlabel"), font_size=self.label_fontsize
- ),
- f"{ax[1]}_title": dict(
- text=kwargs.get("ylabel"), font_size=self.label_fontsize
- ),
- f"{ax[0]}_range": kwargs.get("xlim"),
- f"{ax[1]}_range": kwargs.get("ylim"),
- f"{ax[0]}_automargin": True,
- f"{ax[1]}_automargin": True,
- }
- )
-
- if BasePlot._fig.is_canvas and (title := kwargs.get("title")):
- # Add a subtitle to a plot in the canvas
- default_title = {
- "x": BasePlot._fig.pos[ax[0][5:] or "1"][0],
- "y": BasePlot._fig.pos[ax[0][5:] or "1"][1] + 0.005,
- "xref": "paper",
- "yref": "paper",
- "xanchor": "center",
- "yanchor": "bottom",
- "showarrow": False,
- "font_size": self.title_fontsize - 4,
- }
-
- if isinstance(title, dict):
- title = {**default_title, **title}
- else:
- title = {"text": title, **default_title}
-
- fig.update_layout(dict(annotations=fig.layout.annotations + (title,)))
-
- if not BasePlot._fig.is_canvas and kwargs.get("plotname"):
- default_title = dict(
- x=0.5,
- y=1,
- pad=dict(t=15, b=15),
- xanchor="center",
- yanchor="top",
- xref="paper",
- font_size=self.title_fontsize,
- )
- if isinstance(title := kwargs.get("title"), dict):
- title = {**default_title, **title}
- else:
- title = {"text": title, **default_title}
-
- default_legend = dict(
- traceorder="grouped",
- groupclick=kwargs.get("groupclick", "toggleitem"),
- font_size=self.label_fontsize,
- bgcolor="rgba(255, 255, 255, 0.5)",
- )
- if isinstance(legend := kwargs.get("legend"), str):
- position = {}
- legend = legend.lower()
- if legend == "upper left":
- position = dict(x=0.01, y=0.99, xanchor="left", yanchor="top")
- elif legend == "lower left":
- position = dict(x=0.01, y=0.01, xanchor="left", yanchor="bottom")
- elif legend == "upper right":
- position = dict(x=0.99, y=0.99, xanchor="right", yanchor="top")
- elif legend == "lower right":
- position = dict(x=0.99, y=0.01, xanchor="right", yanchor="bottom")
- elif legend == "upper center":
- position = dict(x=0.5, y=0.99, xanchor="center", yanchor="top")
- elif legend == "lower center":
- position = dict(x=0.5, y=0.01, xanchor="center", yanchor="bottom")
- elif legend == "center left":
- position = dict(x=0.01, y=0.5, xanchor="left", yanchor="middle")
- elif legend == "center right":
- position = dict(x=0.99, y=0.5, xanchor="right", yanchor="middle")
- elif legend == "center":
- position = dict(x=0.5, y=0.5, xanchor="center", yanchor="middle")
- elif legend != "out":
- raise ValueError(
- "Invalid value for the legend parameter. Got unknown "
- f"position: {legend}. Choose from: upper left, upper "
- "right, lower left, lower right, upper center, lower "
- "center, center left, center right, center, out."
- )
- legend = {**default_legend, **position}
- elif isinstance(legend, dict):
- legend = {**default_legend, **legend}
-
- # Update layout with predefined settings
- space1 = self.title_fontsize if title.get("text") else 10
- space2 = self.title_fontsize * int(bool(fig.layout.annotations))
- fig.update_layout(
- title=title,
- legend=legend,
- showlegend=bool(kwargs.get("legend")),
- hoverlabel=dict(font_size=self.label_fontsize),
- font_size=self.tick_fontsize,
- margin=dict(l=50, b=50, r=0, t=25 + space1 + space2, pad=0),
- width=kwargs["figsize"][0],
- height=kwargs["figsize"][1],
- )
-
- # Update plot with custom settings
- fig.update_traces(**self._custom_traces)
- fig.update_layout(**self._custom_layout)
-
- if kwargs.get("filename"):
- if "." not in name or name.endswith(".html"):
- fig.write_html(name if "." in name else name + ".html")
- else:
- fig.write_image(name)
-
- # Log plot to mlflow run of every model visualized
- if getattr(self, "experiment", None) and self.log_plots:
- for m in set(BasePlot._fig.used_models):
- MlflowClient().log_figure(
- run_id=m._run.info.run_id,
- figure=fig,
- artifact_file=name if "." in name else f"{name}.html",
- )
-
- if kwargs.get("display") is True:
- fig.show()
- elif kwargs.get("display") is None:
- return fig
-
- else:
- if kwargs.get("title"):
- ax.set_title(kwargs.get("title"), fontsize=self.title_fontsize, pad=20)
- if kwargs.get("xlabel"):
- ax.set_xlabel(kwargs["xlabel"], fontsize=self.label_fontsize, labelpad=12)
- if kwargs.get("ylabel"):
- ax.set_ylabel(kwargs["ylabel"], fontsize=self.label_fontsize, labelpad=12)
- if ax is not None:
- ax.tick_params(axis="both", labelsize=self.tick_fontsize)
-
- if kwargs.get("figsize"):
- # Convert from pixels to inches
- fig.set_size_inches(
- kwargs["figsize"][0] // fig.get_dpi(),
- kwargs["figsize"][1] // fig.get_dpi(),
- )
- plt.tight_layout()
- if kwargs.get("filename"):
- fig.savefig(name)
-
- # Log plot to mlflow run of every model visualized
- if self.experiment and self.log_plots:
- for m in set(BasePlot._fig.used_models):
- MlflowClient().log_figure(
- run_id=m._run.info.run_id,
- figure=fig,
- artifact_file=name if "." in name else f"{name}.png",
- )
-
- plt.show() if kwargs.get("display") else plt.close()
- if kwargs.get("display") is None:
- return fig
-
- @composed(contextmanager, crash)
- def canvas(
- self,
- rows: INT = 1,
- cols: INT = 2,
- *,
- horizontal_spacing: FLOAT = 0.05,
- vertical_spacing: FLOAT = 0.07,
- title: str | dict | None = None,
- legend: str | dict | None = "out",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: BOOL = True,
- ):
- """Create a figure with multiple plots.
-
- This `@contextmanager` allows you to draw many plots in one
- figure. The default option is to add two plots side by side.
- See the [user guide][canvas] for an example.
-
- Parameters
- ----------
- rows: int, default=1
- Number of plots in length.
-
- cols: int, default=2
- Number of plots in width.
-
- horizontal_spacing: float, default=0.05
- Space between subplot rows in normalized plot coordinates.
- The spacing is relative to the figure's size.
-
- vertical_spacing: float, default=0.07
- Space between subplot cols in normalized plot coordinates.
- The spacing is relative to the figure's size.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: bool, str or dict, default="out"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of plots in the canvas.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool, default=True
- Whether to render the plot.
-
- Yields
- ------
- [go.Figure][]
- Plot object.
-
- """
- BasePlot._fig = BaseFigure(
- rows=rows,
- cols=cols,
- horizontal_spacing=horizontal_spacing,
- vertical_spacing=vertical_spacing,
- palette=self.palette,
- is_canvas=True,
- )
-
- try:
- yield BasePlot._fig.figure
- finally:
- BasePlot._fig.is_canvas = False # Close the canvas
- self._plot(
- groupclick="togglegroup",
- title=title,
- legend=legend,
- figsize=figsize or (550 + 350 * cols, 200 + 400 * rows),
- plotname="canvas",
- filename=filename,
- display=display,
- )
-
- def reset_aesthetics(self):
- """Reset the plot [aesthetics][] to their default values."""
- self._custom_layout = {}
- self._custom_traces = {}
- self._aesthetics = Aesthetics(
- palette=PALETTE,
- title_fontsize=24,
- label_fontsize=16,
- tick_fontsize=12,
- line_width=2,
- marker_size=8,
- )
-
- def update_layout(self, **kwargs):
- """Update the properties of the plot's layout.
-
- Recursively update the structure of the original layout with
- the values in the arguments.
-
- Parameters
- ----------
- **kwargs
- Keyword arguments for the figure's [update_layout][] method.
-
- """
- self._custom_layout = kwargs
-
- def update_traces(self, **kwargs):
- """Update the properties of the plot's traces.
-
- Recursively update the structure of the original traces with
- the values in the arguments.
-
- Parameters
- ----------
- **kwargs
- Keyword arguments for the figure's [update_traces][] method.
-
- """
- self._custom_traces = kwargs
-
-
-@typechecked
-class FeatureSelectorPlot(BasePlot):
- """Feature selection plots.
-
- These plots are accessible from atom or from the FeatureSelector
- class when the appropriate feature selection strategy is used.
-
- """
-
- @available_if(has_attr("pca"))
- @crash
- def plot_components(
- self,
- show: INT | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the explained variance ratio per component.
-
- Kept components are colored and discarded components are
- transparent. This plot is available only when feature selection
- was applied with strategy="pca".
-
- Parameters
- ----------
- show: int or None, default=None
- Number of components to show. None to show all.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of components shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:FeatureSelectorPlot.plot_pca
- atom.plots:FeatureSelectorPlot.plot_rfecv
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.feature_selection("pca", n_features=5)
- atom.plot_components(show=10)
- ```
-
- """
- if show is None or show > self.pca.components_.shape[0]:
- # Limit max features shown to avoid maximum figsize error
- show = min(200, self.pca.components_.shape[0])
- elif show < 1:
- raise ValueError(
- "Invalid value for the show parameter. "
- f"Value should be >0, got {show}."
- )
-
- # Get the variance ratio per component
- variance = np.array(self.pca.explained_variance_ratio_)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- # Create color scheme: first normal and then fully transparent
- color = BasePlot._fig.get_elem("components")
- opacity = [0.2] * self.pca._comps + [0] * (len(variance) - self.pca._comps)
-
- fig.add_trace(
- go.Bar(
- x=variance,
- y=[f"pca{str(i)}" for i in range(len(variance))],
- orientation="h",
- marker=dict(
- color=[f"rgba({color[4:-1]}, {o})" for o in opacity],
- line=dict(width=2, color=color),
- ),
- hovertemplate="%{x}",
- name=f"Variance retained: {variance[:self.pca._comps].sum():.3f}",
- legendgroup="components",
- showlegend=BasePlot._fig.showlegend("components", legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout({f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending")})
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Explained variance ratio",
- ylim=(len(variance) - show - 0.5, len(variance) - 0.5),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_components",
- filename=filename,
- display=display,
- )
-
- @available_if(has_attr("pca"))
- @crash
- def plot_pca(
- self,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the explained variance ratio vs number of components.
-
- If the underlying estimator is [PCA][] (for dense datasets),
- all possible components are plotted. If the underlying estimator
- is [TruncatedSVD][] (for sparse datasets), it only shows the
- selected components. The star marks the number of components
- selected by the user. This plot is available only when feature
- selection was applied with strategy="pca".
-
- Parameters
- ----------
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:FeatureSelectorPlot.plot_components
- atom.plots:FeatureSelectorPlot.plot_rfecv
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.feature_selection("pca", n_features=5)
- atom.plot_pca()
- ```
-
- """
- # Create star symbol at selected number of components
- symbols = ["circle"] * self.pca.n_features_in_
- symbols[self.pca._comps - 1] = "star"
- sizes = [self.marker_size] * self.pca.n_features_in_
- sizes[self.pca._comps - 1] = self.marker_size * 1.5
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- fig.add_trace(
- go.Scatter(
- x=tuple(range(1, self.pca.n_features_in_ + 1)),
- y=np.cumsum(self.pca.explained_variance_ratio_),
- mode="lines+markers",
- line=dict(width=self.line_width, color=BasePlot._fig.get_elem("pca")),
- marker=dict(
- symbol=symbols,
- size=sizes,
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- opacity=1,
- ),
- hovertemplate="%{y}",
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- "hovermode": "x",
- f"xaxis{xaxis[1:]}_showspikes": True,
- f"yaxis{yaxis[1:]}_showspikes": True,
- }
- )
-
- margin = self.pca.n_features_in_ / 30
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="First N principal components",
- ylabel="Cumulative variance ratio",
- xlim=(1 - margin, self.pca.n_features_in_ - 1 + margin),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_pca",
- filename=filename,
- display=display,
- )
-
- @available_if(has_attr("rfecv"))
- @crash
- def plot_rfecv(
- self,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the rfecv results.
-
- Plot the scores obtained by the estimator fitted on every
- subset of the dataset. Only available when feature selection
- was applied with strategy="rfecv".
-
- Parameters
- ----------
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:FeatureSelectorPlot.plot_components
- atom.plots:FeatureSelectorPlot.plot_pca
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.feature_selection("rfecv", solver="Tree")
- atom.plot_rfecv()
- ```
-
- """
- try: # Define the y-label for the plot
- ylabel = self.rfecv.get_params()["scoring"].name
- except AttributeError:
- ylabel = "accuracy" if self.goal.startswith("class") else "r2"
-
- x = range(self.rfecv.min_features_to_select, self.rfecv.n_features_in_ + 1)
-
- # Create star symbol at selected number of features
- sizes = [6] * len(x)
- sizes[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = 12
- symbols = ["circle"] * len(x)
- symbols[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = "star"
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- mean = self.rfecv.cv_results_["mean_test_score"]
- std = self.rfecv.cv_results_["std_test_score"]
-
- fig.add_trace(
- go.Scatter(
- x=list(x),
- y=mean,
- mode="lines+markers",
- line=dict(width=self.line_width, color=BasePlot._fig.get_elem("rfecv")),
- marker=dict(
- symbol=symbols,
- size=sizes,
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- opacity=1,
- ),
- name=ylabel,
- legendgroup="rfecv",
- showlegend=BasePlot._fig.showlegend("rfecv", legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- # Add error bands
- fig.add_traces(
- [
- go.Scatter(
- x=tuple(x),
- y=mean + std,
- mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")),
- hovertemplate="%{y}upper bound",
- legendgroup="rfecv",
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- ),
- go.Scatter(
- x=tuple(x),
- y=mean - std,
- mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")),
- fill="tonexty",
- fillcolor=f"rgba{BasePlot._fig.get_elem('rfecv')[3:-1]}, 0.2)",
- hovertemplate="%{y}lower bound",
- legendgroup="rfecv",
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- ),
- ]
- )
-
- fig.update_layout({"hovermode": "x unified"})
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- groupclick="togglegroup",
- xlabel="Number of features",
- ylabel=ylabel,
- xlim=(min(x) - len(x) / 30, max(x) + len(x) / 30),
- ylim=(min(mean) - 3 * max(std), max(mean) + 3 * max(std)),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_rfecv",
- filename=filename,
- display=display,
- )
-
-
-@typechecked
-class DataPlot(BasePlot):
- """Data plots.
-
- Plots used for understanding and interpretation of the dataset.
- They are only accessible from atom, since the other runners should
- be used for model training only, not for data manipulation.
-
- """
-
- @crash
- def plot_correlation(
- self,
- columns: slice | SEQUENCE | None = None,
- method: str = "pearson",
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (800, 700),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a correlation matrix.
-
- Displays a heatmap showing the correlation between columns in
- the dataset. The colors red, blue and white stand for positive,
- negative, and no correlation respectively.
-
- Parameters
- ----------
- columns: slice, sequence or None, default=None
- Columns to plot. If None, plot all columns in the dataset.
- Selected categorical columns are ignored.
-
- method: str, default="pearson"
- Method of correlation. Choose from: pearson, kendall or
- spearman.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple, default=(800, 700)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_distribution
- atom.plots:DataPlot.plot_qq
- atom.plots:DataPlot.plot_relationships
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.plot_correlation()
- ```
-
- """
- columns = self.branch._get_columns(columns, only_numerical=True)
- if method.lower() not in ("pearson", "kendall", "spearman"):
- raise ValueError(
- f"Invalid value for the method parameter, got {method}. "
- "Choose from: pearson, kendall or spearman."
- )
-
- # Compute the correlation matrix
- corr = self.dataset[columns].corr(method=method.lower())
-
- # Generate a mask for the lower triangle
- # k=1 means keep outermost diagonal line
- mask = np.zeros_like(corr, dtype=bool)
- mask[np.triu_indices_from(mask, k=1)] = True
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(0, 0.87),
- coloraxis=dict(
- colorscale="rdbu_r",
- cmin=-1,
- cmax=1,
- title=f"{method.lower()} correlation",
- font_size=self.label_fontsize,
- ),
- )
-
- fig.add_trace(
- go.Heatmap(
- z=corr.mask(mask),
- x=columns,
- y=columns,
- coloraxis=f"coloraxis{xaxis[1:]}",
- hovertemplate="x:%{x}
y:%{y}
z:%{z}",
- hoverongaps=False,
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- "template": "plotly_white",
- f"yaxis{yaxis[1:]}_autorange": "reversed",
- f"xaxis{xaxis[1:]}_showgrid": False,
- f"yaxis{yaxis[1:]}_showgrid": False,
- }
- )
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_correlation",
- filename=filename,
- display=display,
- )
-
- @crash
- def plot_distribution(
- self,
- columns: SLICE = 0,
- distributions: str | SEQUENCE | None = None,
- show: INT | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper right",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot column distributions.
-
- - For numerical columns, plot the probability density
- distribution. Additionally, it's possible to plot any of
- `scipy.stats` distributions fitted to the column.
- - For categorical columns, plot the class distribution.
- Only one categorical column can be plotted at the same time.
-
- !!! tip
- Use atom's [distribution][atomclassifier-distribution]
- method to check which distribution fits the column best.
-
- Parameters
- ----------
- columns: int, str, slice or sequence, default=0
- Columns to plot. I's only possible to plot one categorical
- column. If more than one categorical columns are selected,
- all categorical columns are ignored.
-
- distributions: str, sequence or None, default=None
- Names of the `scipy.stats` distributions to fit to the
- columns. If None, a [Gaussian kde distribution][kde] is
- showed. Only for numerical columns.
-
- show: int or None, default=None
- Number of classes (ordered by number of occurrences) to
- show in the plot. If None, it shows all classes. Only for
- categorical columns.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None: No title is shown.
- - If str: Text for the title.
- - If dict: [title configuration][parameters].
-
- legend: str, dict or None, default="upper right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the plot's type.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_correlation
- atom.plots:DataPlot.plot_qq
- atom.plots:DataPlot.plot_relationships
-
- Examples
- --------
- ```pycon
- import numpy as np
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- # Add a categorical feature
- animals = ["cat", "dog", "bird", "lion", "zebra"]
- probabilities = [0.001, 0.1, 0.2, 0.3, 0.399]
- X["animals"] = np.random.choice(animals, size=len(X), p=probabilities)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.plot_distribution(columns=[0, 1])
- atom.plot_distribution(columns=0, distributions=["norm", "invgauss"])
- atom.plot_distribution(columns="animals")
- ```
-
- """
- columns = self.branch._get_columns(columns)
- cat_columns = list(self.dataset.select_dtypes(exclude="number").columns)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- if len(columns) == 1 and columns[0] in cat_columns:
- series = self.dataset[columns[0]].value_counts(ascending=True)
-
- if show is None or show > len(series):
- show = len(series)
- elif show < 1:
- raise ValueError(
- "Invalid value for the show parameter."
- f"Value should be >0, got {show}."
- )
-
- color = BasePlot._fig.get_elem()
- fig.add_trace(
- go.Bar(
- x=series,
- y=series.index,
- orientation="h",
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
- hovertemplate="%{x}",
- name=f"{columns[0]}: {len(series)} classes",
- showlegend=BasePlot._fig.showlegend("dist", legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Counts",
- ylim=(len(series) - show - 0.5, len(series) - 0.5),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_distribution",
- filename=filename,
- display=display,
- )
-
- else:
- for col in [c for c in columns if c not in cat_columns]:
- fig.add_trace(
- go.Histogram(
- x=self.dataset[col],
- histnorm="probability density",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(col)),
- ),
- nbinsx=40,
- name="dist",
- legendgroup=col,
- legendgrouptitle=dict(text=col, font_size=self.label_fontsize),
- showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- x = np.linspace(self.dataset[col].min(), self.dataset[col].max(), 200)
-
- # Drop missing values for compatibility with scipy.stats
- missing = self.missing + [np.inf, -np.inf]
- values = self.dataset[col].replace(missing, np.NaN).dropna()
-
- if distributions:
- # Get a line for each distribution
- for j, dist in enumerate(lst(distributions)):
- params = getattr(stats, dist).fit(values)
-
- fig.add_trace(
- self._draw_line(
- x=x,
- y=getattr(stats, dist).pdf(x, *params),
- parent=col,
- child=dist,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
- else:
- # If no distributions specified, draw Gaussian kde
- fig.add_trace(
- self._draw_line(
- x=x,
- y=stats.gaussian_kde(values)(x),
- parent=col,
- child="kde",
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(dict(barmode="overlay"))
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Values",
- ylabel="Probability density",
- title=title,
- legend=legend,
- figsize=figsize or (900, 600),
- plotname="plot_distribution",
- filename=filename,
- display=display,
- )
-
- @crash
- def plot_ngrams(
- self,
- ngram: INT | str = "bigram",
- index: SLICE | None = None,
- show: INT = 10,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot n-gram frequencies.
-
- The text for the plot is extracted from the column named
- `corpus`. If there is no column with that name, an exception
- is raised. If the documents are not tokenized, the words are
- separated by spaces.
-
- !!! tip
- Use atom's [tokenize][atomclassifier-tokenize] method to
- separate the words creating n-grams based on their frequency
- in the corpus.
-
- Parameters
- ----------
- ngram: str or int, default="bigram"
- Number of contiguous words to search for (size of n-gram).
- Choose from: words (1), bigrams (2), trigrams (3),
- quadgrams (4).
-
- index: int, str, slice, sequence or None, default=None
- Documents in the corpus to include in the search. If None,
- it selects all documents in the dataset.
-
- show: int, default=10
- Number of n-grams (ordered by number of occurrences) to
- show in the plot.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of n-grams shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_wordcloud
-
- Examples
- --------
- ```pycon
- import numpy as np
- from atom import ATOMClassifier
- from sklearn.datasets import fetch_20newsgroups
-
- X, y = fetch_20newsgroups(
- return_X_y=True,
- categories=["alt.atheism", "sci.med", "comp.windows.x"],
- shuffle=True,
- random_state=1,
- )
- X = np.array(X).reshape(-1, 1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.textclean()
- atom.textnormalize()
- atom.plot_ngrams()
- ```
-
- """
-
- def get_text(column: SERIES) -> SERIES:
- """Get the complete corpus as sequence of tokens.
-
- Parameters
- ----------
- column: series
- Column containing the corpus.
-
- Returns
- -------
- series
- Corpus of tokens.
-
- """
- if isinstance(column.iat[0], str):
- return column.apply(lambda row: row.split())
- else:
- return column
-
- corpus = get_corpus(self.X)
- rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)]
-
- if str(ngram).lower() in ("1", "word", "words"):
- ngram = "words"
- series = pd.Series(
- [word for row in get_text(rows[corpus]) for word in row]
- ).value_counts(ascending=True)
- else:
- if str(ngram).lower() in ("2", "bigram", "bigrams"):
- ngram, finder = "bigrams", BigramCollocationFinder
- elif str(ngram).lower() in ("3", "trigram", "trigrams"):
- ngram, finder = "trigrams", TrigramCollocationFinder
- elif str(ngram).lower() in ("4", "quadgram", "quadgrams"):
- ngram, finder = "quadgrams", QuadgramCollocationFinder
- else:
- raise ValueError(
- f"Invalid value for the ngram parameter, got {ngram}. "
- "Choose from: words, bigram, trigram, quadgram."
- )
-
- ngram_fd = finder.from_documents(get_text(rows[corpus])).ngram_fd
- series = pd.Series(
- data=[x[1] for x in ngram_fd.items()],
- index=[" ".join(x[0]) for x in ngram_fd.items()],
- ).sort_values(ascending=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- fig.add_trace(
- go.Bar(
- x=(data := series[-show:]),
- y=data.index,
- orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(ngram)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(ngram)),
- ),
- hovertemplate="%{x}",
- name=f"Total {ngram}: {len(series)}",
- legendgroup=ngram,
- showlegend=BasePlot._fig.showlegend(ngram, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Counts",
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_ngrams",
- filename=filename,
- display=display,
- )
-
- @crash
- def plot_qq(
- self,
- columns: SLICE = 0,
- distributions: str | SEQUENCE = "norm",
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a quantile-quantile plot.
-
- Columns are distinguished by color and the distributions are
- distinguished by marker type. Missing values are ignored.
-
- Parameters
- ----------
- columns: int, str, slice or sequence, default=0
- Columns to plot. Selected categorical columns are ignored.
-
- distributions: str or sequence, default="norm"
- Names of the `scipy.stats` distributions to fit to the
- columns.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_correlation
- atom.plots:DataPlot.plot_distribution
- atom.plots:DataPlot.plot_relationships
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.plot_qq(columns=[5, 6])
- atom.plot_qq(columns=0, distributions=["norm", "invgauss", "triang"])
- ```
-
- """
- columns = self.branch._get_columns(columns)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- percentiles = np.linspace(0, 100, 101)
- for col in columns:
- # Drop missing values for compatibility with scipy.stats
- missing = self.missing + [np.inf, -np.inf]
- values = self.dataset[col].replace(missing, np.NaN).dropna()
-
- for dist in lst(distributions):
- stat = getattr(stats, dist)
- params = stat.fit(values)
- samples = stat.rvs(*params, size=101, random_state=self.random_state)
-
- fig.add_trace(
- self._draw_line(
- x=np.percentile(samples, percentiles),
- y=np.percentile(values, percentiles),
- mode="markers",
- parent=col,
- child=dist,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Theoretical quantiles",
- ylabel="Observed quantiles",
- title=title,
- legend=legend,
- figsize=figsize or (900, 600),
- plotname="plot_qq",
- filename=filename,
- display=display,
- )
-
- @crash
- def plot_relationships(
- self,
- columns: slice | SEQUENCE = (0, 1, 2),
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (900, 900),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot pairwise relationships in a dataset.
-
- Creates a grid of axes such that each numerical column appears
- once on the x-axes and once on the y-axes. The bottom triangle
- contains scatter plots (max 250 random samples), the diagonal
- plots contain column distributions, and the upper triangle
- contains contour histograms for all samples in the columns.
-
- Parameters
- ----------
- columns: slice or sequence, default=(0, 1, 2)
- Columns to plot. Selected categorical columns are ignored.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple, default=(900, 900)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_correlation
- atom.plots:DataPlot.plot_distribution
- atom.plots:DataPlot.plot_qq
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.plot_relationships(columns=[0, 4, 5])
- ```
-
- """
- columns = self.branch._get_columns(columns, only_numerical=True)
-
- # Use max 250 samples to not clutter the plot
- sample = lambda col: self.dataset[col].sample(
- n=min(len(self.dataset), 250), random_state=self.random_state
- )
-
- fig = self._get_figure()
- color = BasePlot._fig.get_elem()
- for i in range(len(columns)**2):
- x, y = i // len(columns), i % len(columns)
-
- # Calculate the distance between subplots
- offset = divide(0.0125, (len(columns) - 1))
-
- # Calculate the size of the subplot
- size = (1 - ((offset * 2) * (len(columns) - 1))) / len(columns)
-
- # Determine the position for the axes
- x_pos = y * (size + 2 * offset)
- y_pos = (len(columns) - x - 1) * (size + 2 * offset)
-
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(x_pos, rnd(x_pos + size)),
- y=(y_pos, rnd(y_pos + size)),
- coloraxis=dict(
- colorscale=PALETTE.get(color, "Blues"),
- cmin=0,
- cmax=len(self.dataset),
- showscale=False,
- )
- )
-
- if x == y:
- fig.add_trace(
- go.Histogram(
- x=self.dataset[columns[x]],
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
- name=columns[x],
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
- elif x > y:
- fig.add_trace(
- go.Scatter(
- x=sample(columns[y]),
- y=sample(columns[x]),
- mode="markers",
- marker=dict(color=color),
- hovertemplate="(%{x}, %{y})",
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
- elif y > x:
- fig.add_trace(
- go.Histogram2dContour(
- x=self.dataset[columns[y]],
- y=self.dataset[columns[x]],
- coloraxis=f"coloraxis{xaxis[1:]}",
- hovertemplate="x:%{x}
y:%{y}
z:%{z}",
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- if x < len(columns) - 1:
- fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
- if y > 0:
- fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
-
- self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel=columns[y] if x == len(columns) - 1 else None,
- ylabel=columns[x] if y == 0 else None,
- )
-
- return self._plot(
- title=title,
- legend=legend,
- figsize=figsize or (900, 900),
- plotname="plot_relationships",
- filename=filename,
- display=display,
- )
-
- @crash
- def plot_wordcloud(
- self,
- index: SLICE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- **kwargs,
- ) -> go.Figure | None:
- """Plot a wordcloud from the corpus.
-
- The text for the plot is extracted from the column named
- `corpus`. If there is no column with that name, an exception
- is raised.
-
- Parameters
- ----------
- index: int, str, slice, sequence or None, default=None
- Documents in the corpus to include in the wordcloud. If
- None, it selects all documents in the dataset.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- **kwargs
- Additional keyword arguments for the [Wordcloud][] object.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_ngrams
- atom.plots:PredictionPlot.plot_pipeline
-
- Examples
- --------
- ```pycon
- import numpy as np
- from atom import ATOMClassifier
- from sklearn.datasets import fetch_20newsgroups
-
- X, y = fetch_20newsgroups(
- return_X_y=True,
- categories=["alt.atheism", "sci.med", "comp.windows.x"],
- shuffle=True,
- random_state=1,
- )
- X = np.array(X).reshape(-1, 1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.textclean()
- atom.textnormalize()
- atom.plot_wordcloud()
- ```
-
- """
-
- def get_text(column):
- """Get the complete corpus as one long string."""
- if isinstance(column.iat[0], str):
- return " ".join(column)
- else:
- return " ".join([" ".join(row) for row in column])
-
- check_dependency("wordcloud")
- from wordcloud import WordCloud
-
- corpus = get_corpus(self.X)
- rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)]
-
- wordcloud = WordCloud(
- width=figsize[0],
- height=figsize[1],
- background_color=kwargs.pop("background_color", "white"),
- random_state=kwargs.pop("random_state", self.random_state),
- **kwargs,
- )
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- fig.add_trace(
- go.Image(
- z=wordcloud.generate(get_text(rows[corpus])),
- hoverinfo="skip",
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- f"xaxis{xaxis[1:]}_showticklabels": False,
- f"yaxis{xaxis[1:]}_showticklabels": False,
- }
- )
-
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- title=title,
- legend=legend,
- figsize=figsize or (900, 600),
- plotname="plot_wordcloud",
- filename=filename,
- display=display,
- )
-
-
-@typechecked
-class HTPlot(BasePlot):
- """Hyperparameter tuning plots.
-
- Plots that help interpret the model's study and corresponding
- trials. These plots are accessible from the runners or from the
- models. If called from a runner, the `models` parameter has to be
- specified (if None, uses all models). If called from a model, that
- model is used and the `models` parameter becomes unavailable.
-
- """
-
- @composed(crash, plot_from_model)
- def plot_edf(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- metric: INT | str | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper left",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the Empirical Distribution Function of a study.
-
- Use this plot to analyze and improve hyperparameter search
- spaces. The EDF assumes that the value of the objective
- function is in accordance with the uniform distribution over
- the objective space. This plot is only available for models
- that ran [hyperparameter tuning][].
-
- !!! note
- Only complete trials are considered when plotting the EDF.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models that used hyperparameter
- tuning are selected.
-
- metric: int, str, sequence or None, default=None
- Metric to plot (only for multi-metric runs). If str, add `+`
- between options to select more than one. If None, the metric
- used to run the pipeline is selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_hyperparameters
- atom.plots:HTPlot.plot_trials
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from optuna.distributions import IntDistribution
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
-
- # Run three models with different search spaces
- atom.run(
- models="RF_1",
- n_trials=10,
- ht_params={"distributions": {"n_estimators": IntDistribution(6, 10)}},
- )
- atom.run(
- models="RF_2",
- n_trials=10,
- ht_params={"distributions": {"n_estimators": IntDistribution(11, 15)}},
- )
- atom.run(
- models="RF_3",
- n_trials=10,
- ht_params={"distributions": {"n_estimators": IntDistribution(16, 20)}},
- )
-
- atom.plot_edf()
- ```
-
- """
- models = check_hyperparams(models, "plot_edf")
- metric = self._get_metric(metric, max_one=False)
-
- values = []
- for m in models:
- values.append([])
- for met in metric:
- values[-1].append(np.array([lst(row)[met] for row in m.trials["score"]]))
-
- x_min = np.nanmin(np.array(values))
- x_max = np.nanmax(np.array(values))
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m, val in zip(models, values):
- for met in metric:
- fig.add_trace(
- self._draw_line(
- x=(x := np.linspace(x_min, x_max, 100)),
- y=np.sum(val[met][:, np.newaxis] <= x, axis=0) / len(val[met]),
- parent=m.name,
- child=self._metric[met].name,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- ylim=(0, 1),
- xlabel="Score",
- ylabel="Cumulative Probability",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_edf",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_hyperparameter_importance(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- metric: int | str = 0,
- show: INT | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a model's hyperparameter importance.
-
- The hyperparameter importance are calculated using the
- [fANOVA][] importance evaluator. The sum of importances for all
- parameters (per model) is 1. This plot is only available for
- models that ran [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models that used hyperparameter
- tuning are selected.
-
- metric: int or str, default=0
- Metric to plot (only for multi-metric runs).
-
- show: int or None, default=None
- Number of hyperparameters (ordered by importance) to show.
- None to show all.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of hyperparameters shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_feature_importance
- atom.plots:HTPlot.plot_hyperparameters
- atom.plots:HTPlot.plot_trials
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["ET", "RF"], n_trials=10)
- atom.plot_hyperparameter_importance()
- ```
-
- """
- models = check_hyperparams(models, "plot_hyperparameter_importance")
- params = len(set([k for m in lst(models) for k in m._ht["distributions"]]))
- met = self._get_metric(metric, max_one=True)
-
- if show is None or show > params:
- # Limit max features shown to avoid maximum figsize error
- show = min(200, params)
- elif show < 1:
- raise ValueError(
- f"Invalid value for the show parameter. Value should be >0, got {show}."
- )
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- importances = FanovaImportanceEvaluator(seed=self.random_state).evaluate(
- study=m.study,
- target=None if len(self._metric) == 1 else lambda x: x.values[met],
- )
-
- fig.add_trace(
- go.Bar(
- x=np.array(list(importances.values())) / sum(importances.values()),
- y=list(importances.keys()),
- orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
- hovertemplate="%{x}",
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
- "bargroupgap": 0.05,
- }
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Normalized hyperparameter importance",
- ylim=(params - show - 0.5, params - 0.5),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_hyperparameter_importance",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_hyperparameters(
- self,
- models: INT | str | MODEL | None = None,
- params: str | slice | SEQUENCE = (0, 1),
- metric: int | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot hyperparameter relationships in a study.
-
- A model's hyperparameters are plotted against each other. The
- corresponding metric scores are displayed in a contour plot.
- The markers are the trials in the study. This plot is only
- available for models that ran [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_hyperparameters()`.
-
- params: str, slice or sequence, default=(0, 1)
- Hyperparameters to plot. Use a sequence or add `+` between
- options to select more than one.
-
- metric: int or str, default=0
- Metric to plot (only for multi-metric runs).
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of hyperparameters shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_hyperparameter_importance
- atom.plots:HTPlot.plot_parallel_coordinate
- atom.plots:HTPlot.plot_trials
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR", n_trials=15)
- atom.plot_hyperparameters(params=(0, 1, 2))
- ```
-
- """
- m = check_hyperparams(models, "plot_hyperparameters")[0]
-
- if len(params := self._get_hyperparams(params, models)) < 2:
- raise ValueError(
- "Invalid value for the hyperparameters parameter. A minimum "
- f"of two parameters is required, got {len(params)}."
- )
-
- met = self._get_metric(metric, max_one=True)
-
- fig = self._get_figure()
- for i in range((length := len(params) - 1) ** 2):
- x, y = i // length, i % length
-
- if y <= x:
- # Calculate the size of the subplot
- size = 1 / length
-
- # Determine the position for the axes
- x_pos = y * size
- y_pos = (length - x - 1) * size
-
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(x_pos, rnd(x_pos + size)),
- y=(y_pos, rnd(y_pos + size)),
- coloraxis=dict(
- axes="99",
- colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"),
- cmin=np.nanmin(
- m.trials.apply(lambda x: lst(x["score"])[met], axis=1)
- ),
- cmax=np.nanmax(
- m.trials.apply(lambda x: lst(x["score"])[met], axis=1)
- ),
- showscale=False,
- )
- )
-
- x_values = lambda row: row["params"].get(params[y], None)
- y_values = lambda row: row["params"].get(params[x + 1], None)
-
- fig.add_trace(
- go.Scatter(
- x=m.trials.apply(x_values, axis=1),
- y=m.trials.apply(y_values, axis=1),
- mode="markers",
- marker=dict(
- size=self.marker_size,
- color=BasePlot._fig.get_elem(m.name),
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
- customdata=list(
- zip(
- m.trials.index.tolist(),
- m.trials.apply(lambda x: lst(x["score"])[met], axis=1),
- )
- ),
- hovertemplate=(
- f"{params[y]}:%{{x}}
"
- f"{params[x + 1]}:%{{y}}
"
- f"{self._metric[met].name}:%{{customdata[1]:.4f}}"
- "Trial %{customdata[0]}"
- ),
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.add_trace(
- go.Contour(
- x=m.trials.apply(x_values, axis=1),
- y=m.trials.apply(y_values, axis=1),
- z=m.trials.apply(lambda i: lst(i["score"])[met], axis=1),
- contours=dict(
- showlabels=True,
- labelfont=dict(size=self.tick_fontsize, color="white")
- ),
- coloraxis="coloraxis99",
- hoverinfo="skip",
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- if _is_log_scale(m.study.trials, params[y]):
- fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"})
- if _is_log_scale(m.study.trials, params[x + 1]):
- fig.update_layout({f"yaxis{xaxis[1:]}_type": "log"})
-
- if x < length - 1:
- fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
- if y > 0:
- fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
-
- fig.update_layout(
- {
- "template": "plotly_white",
- f"xaxis{xaxis[1:]}_showgrid": False,
- f"yaxis{yaxis[1:]}_showgrid": False,
- f"xaxis{yaxis[1:]}_zeroline": False,
- f"yaxis{yaxis[1:]}_zeroline": False,
- }
- )
-
- self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel=params[y] if x == length - 1 else None,
- ylabel=params[x + 1] if y == 0 else None,
- )
-
- BasePlot._fig.used_models.append(m)
- return self._plot(
- title=title,
- legend=legend,
- figsize=figsize or (800 + 100 * length, 500 + 100 * length),
- plotname="plot_hyperparameters",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_parallel_coordinate(
- self,
- models: INT | str | MODEL | None = None,
- params: str | slice | SEQUENCE | None = None,
- metric: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot high-dimensional parameter relationships in a study.
-
- Every line of the plot represents one trial. This plot is only
- available for models that ran [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_parallel_coordinate()`.
-
- params: str, slice, sequence or None, default=None
- Hyperparameters to plot. Use a sequence or add `+` between
- options to select more than one. If None, all the model's
- hyperparameters are selected.
-
- metric: int or str, default=0
- Metric to plot (only for multi-metric runs).
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of hyperparameters shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_edf
- atom.plots:HTPlot.plot_hyperparameter_importance
- atom.plots:HTPlot.plot_hyperparameters
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("RF", n_trials=15)
- atom.plot_parallel_coordinate(params=slice(1, 5))
- ```
-
- """
-
- def sort_mixed_types(values: list[str]) -> list[str]:
- """Sort a sequence of numbers and strings.
-
- Numbers are converted and take precedence over strings.
-
- Parameters
- ----------
- values: list
- Values to sort.
-
- Returns
- -------
- list of str
- Sorted values.
-
- """
- numbers, categorical = [], []
- for elem in values:
- try:
- numbers.append(it(float(elem)))
- except (TypeError, ValueError):
- categorical.append(str(elem))
-
- return list(map(str, sorted(numbers))) + sorted(categorical)
-
- m = check_hyperparams(models, "plot_parallel_coordinate")[0]
- params = self._get_hyperparams(params, models)
- met = self._get_metric(metric, max_one=True)
-
- dims = _get_dims_from_info(
- _get_parallel_coordinate_info(
- study=m.study,
- params=params,
- target=None if len(self._metric) == 1 else lambda x: x.values[met],
- target_name=self._metric[met].name,
- )
- )
-
- # Clean up dimensions for nicer view
- for d in [dims[0]] + sorted(dims[1:], key=lambda x: params.index(x["label"])):
- if "ticktext" in d:
- # Skip processing for logarithmic params
- if all(isinstance(i, INT_TYPES) for i in d["values"]):
- # Order categorical values
- mapping = [d["ticktext"][i] for i in d["values"]]
- d["ticktext"] = sort_mixed_types(d["ticktext"])
- d["values"] = [d["ticktext"].index(v) for v in mapping]
- else:
- # Round numerical values
- d["tickvals"] = list(
- map(rnd, np.linspace(min(d["values"]), max(d["values"]), 5))
- )
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes(
- coloraxis=dict(
- colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"),
- cmin=min(dims[0]["values"]),
- cmax=max(dims[0]["values"]),
- title=self._metric[met].name,
- font_size=self.label_fontsize,
- )
- )
-
- fig.add_trace(
- go.Parcoords(
- dimensions=dims,
- line=dict(
- color=dims[0]["values"],
- coloraxis=f"coloraxis{xaxis[1:]}",
- ),
- unselected=dict(line=dict(color="gray", opacity=0.5)),
- labelside="bottom",
- labelfont=dict(size=self.label_fontsize),
- )
- )
-
- BasePlot._fig.used_models.append(m)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- title=title,
- legend=legend,
- figsize=figsize or (700 + len(params) * 50, 600),
- plotname="plot_parallel_coordinate",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_pareto_front(
- self,
- models: INT | str | MODEL | None = None,
- metric: str | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the Pareto front of a study.
-
- Shows the trial scores plotted against each other. The marker's
- colors indicate the trial number. This plot is only available
- for models that ran [multi-metric runs][] with
- [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_pareto_front()`.
-
- metric: str, sequence or None, default=None
- Metrics to plot. Use a sequence or add `+` between options
- to select more than one. If None, the metrics used to run
- the pipeline are selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of metrics shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_edf
- atom.plots:HTPlot.plot_slice
- atom.plots:HTPlot.plot_trials
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(
- models="RF",
- metric=["f1", "accuracy", "recall"],
- n_trials=15,
- )
- atom.plot_pareto_front()
- ```
-
- """
- m = check_hyperparams(models, "plot_pareto_front")[0]
-
- if len(metric := self._get_metric(metric, max_one=False)) < 2:
- raise ValueError(
- "Invalid value for the metric parameter. A minimum "
- f"of two metrics are required, got {len(metric)}."
- )
-
- fig = self._get_figure()
- for i in range((length := len(metric) - 1) ** 2):
- x, y = i // length, i % length
-
- if y <= x:
- # Calculate the distance between subplots
- offset = divide(0.0125, length - 1)
-
- # Calculate the size of the subplot
- size = (1 - ((offset * 2) * (length - 1))) / length
-
- # Determine the position for the axes
- x_pos = y * (size + 2 * offset)
- y_pos = (length - x - 1) * (size + 2 * offset)
-
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(x_pos, rnd(x_pos + size)),
- y=(y_pos, rnd(y_pos + size)),
- )
-
- fig.add_trace(
- go.Scatter(
- x=m.trials.apply(lambda row: row["score"][y], axis=1),
- y=m.trials.apply(lambda row: row["score"][x + 1], axis=1),
- mode="markers",
- marker=dict(
- size=self.marker_size,
- color=m.trials.index,
- colorscale="Teal",
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
- customdata=m.trials.index,
- hovertemplate="(%{x}, %{y})Trial %{customdata}",
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- if x < len(metric) - 1:
- fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
- if y > 0:
- fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
-
- self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel=self._metric[y].name if x == length - 1 else None,
- ylabel=self._metric[x + 1].name if y == 0 else None,
- )
-
- BasePlot._fig.used_models.append(m)
- return self._plot(
- title=title,
- legend=legend,
- figsize=figsize or (500 + 100 * length, 500 + 100 * length),
- plotname="plot_pareto_front",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_slice(
- self,
- models: INT | str | MODEL | None = None,
- params: str | slice | SEQUENCE | None = None,
- metric: INT | str | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the parameter relationship in a study.
-
- The color of the markers indicate the trial. This plot is only
- available for models that ran [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_slice()`.
-
- params: str, slice, sequence or None, default=None
- Hyperparameters to plot. Use a sequence or add `+` between
- options to select more than one. If None, all the model's
- hyperparameters are selected.
-
- metric: int or str, default=None
- Metric to plot (only for multi-metric runs). If str, add `+`
- between options to select more than one. If None, the metric
- used to run the pipeline is selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of hyperparameters shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_edf
- atom.plots:HTPlot.plot_hyperparameters
- atom.plots:HTPlot.plot_parallel_coordinate
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(
- models="RF",
- metric=["f1", "recall"],
- n_trials=15,
- )
- atom.plot_slice(params=(0, 1, 2))
- ```
-
- """
- m = check_hyperparams(models, "plot_slice")[0]
- params = self._get_hyperparams(params, models)
- metric = self._get_metric(metric, max_one=False)
-
- fig = self._get_figure()
- for i in range(len(params) * len(metric)):
- x, y = i // len(params), i % len(params)
-
- # Calculate the distance between subplots
- x_offset = divide(0.0125, (len(params) - 1))
- y_offset = divide(0.0125, (len(metric) - 1))
-
- # Calculate the size of the subplot
- x_size = (1 - ((x_offset * 2) * (len(params) - 1))) / len(params)
- y_size = (1 - ((y_offset * 2) * (len(metric) - 1))) / len(metric)
-
- # Determine the position for the axes
- x_pos = y * (x_size + 2 * x_offset)
- y_pos = (len(metric) - x - 1) * (y_size + 2 * y_offset)
-
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(x_pos, rnd(x_pos + x_size)),
- y=(y_pos, rnd(y_pos + y_size)),
- )
-
- fig.add_trace(
- go.Scatter(
- x=m.trials.apply(lambda r: r["params"].get(params[y], None), axis=1),
- y=m.trials.apply(lambda r: lst(r["score"])[x], axis=1),
- mode="markers",
- marker=dict(
- size=self.marker_size,
- color=m.trials.index,
- colorscale="Teal",
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
- customdata=m.trials.index,
- hovertemplate="(%{x}, %{y})Trial %{customdata}",
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- if _is_log_scale(m.study.trials, params[y]):
- fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"})
-
- if x < len(metric) - 1:
- fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
- if y > 0:
- fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
-
- self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel=params[y] if x == len(metric) - 1 else None,
- ylabel=self._metric[x].name if y == 0 else None,
- )
-
- BasePlot._fig.used_models.append(m)
- return self._plot(
- title=title,
- legend=legend,
- figsize=figsize or (800 + 100 * len(params), 500 + 100 * len(metric)),
- plotname="plot_slice",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_terminator_improvement(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the potentials for future objective improvement.
-
- This function visualizes the objective improvement potentials.
- It helps to determine whether you should continue the
- optimization or not. The evaluated error is also plotted. Note
- that this function may take some time to compute the improvement
- potentials. This plot is only available for models that ran
- [hyperparameter tuning][].
-
- !!! warning
- * The plot_terminator_improvement method is only available
- for models that ran [hyperparameter tuning][] using
- cross-validation, e.g. using `ht_params={'cv': 5}`.
- * This method can be slow. Results are cached to fasten
- repeated calls.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models that used hyperparameter
- tuning are selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper right",
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y)
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_pareto_front
- atom.plots:HTPlot.plot_timeline
- atom.plots:HTPlot.plot_trials
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("RF", n_trials=10, ht_params={"cv": 5})
- atom.plot_terminator_improvement()
- ```
-
- """
- check_dependency("botorch")
-
- models = check_hyperparams(models, "plot_terminator_improvement")
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- if m._ht["cv"] > 1:
- info = self._memory.cache(_get_improvement_info)(m.study, get_error=True)
- else:
- raise ValueError(
- "The plot_terminator_improvement method is only available for "
- "models that ran hyperparameter tuning using cross-validation, "
- "e.g. using ht_params={'cv': 5}."
- )
-
- fig.add_trace(
- self._draw_line(
- x=m.trials.index,
- y=info.improvements,
- error_y=dict(type="data", array=info.errors),
- mode="markers+lines",
- parent=m.name,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Trial",
- ylabel="Terminator improvement",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_terminator_improvement",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_timeline(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the timeline of a study.
-
- This plot is only available for models that ran
- [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models that used hyperparameter
- tuning are selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right",
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y)
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_edf
- atom.plots:HTPlot.plot_slice
- atom.plots:HTPlot.plot_terminator_improvement
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from optuna.pruners import PatientPruner
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(
- models="LGB",
- n_trials=15,
- ht_params={"pruner": PatientPruner(None, patience=2)},
- )
- atom.plot_timeline()
- ```
-
- """
- models = check_hyperparams(models, "plot_timeline")
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- _cm = {
- "COMPLETE": BasePlot._fig._palette[0], # Main color
- "FAIL": "rgb(255, 0, 0)", # Red
- "PRUNED": "rgb(255, 165, 0)", # Orange
- "RUNNING": "rgb(124, 252, 0)", # Green
- "WAITING": "rgb(220, 220, 220)", # Gray
- }
-
- for m in models:
- info = []
- for trial in m.study.get_trials(deepcopy=False):
- date_complete = trial.datetime_complete or datetime.now()
- date_start = trial.datetime_start or date_complete
-
- # Create nice representation of scores and params for hover
- s = [f'{m}: {trial.values[i]}' for i, m in enumerate(self._metric.keys())]
- p = [f" --> {k}: {v}" for k, v in trial.params.items()]
-
- info.append(
- Bunch(
- number=trial.number,
- start=date_start,
- duration=1000 * (date_complete - date_start).total_seconds(),
- state=trial.state,
- hovertext=(
- f"Trial: {trial.number}
"
- f"{'
'.join(s)}"
- f"Parameters:
{'
'.join(p)}"
- )
- )
- )
-
- for state in sorted(TrialState, key=lambda x: x.name):
- if bars := list(filter(lambda x: x.state == state, info)):
- fig.add_trace(
- go.Bar(
- name=state.name,
- x=[b.duration for b in bars],
- y=[b.number for b in bars],
- base=[b.start.isoformat() for b in bars],
- text=[b.hovertext for b in bars],
- textposition="none",
- hovertemplate=f"%{{text}}{m.name}",
- orientation="h",
- marker=dict(
- color=f"rgba({_cm[state.name][4:-1]}, 0.2)",
- line=dict(width=2, color=_cm[state.name]),
- ),
- showlegend=BasePlot._fig.showlegend(_cm[state.name], legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout({f"xaxis{yaxis[1:]}_type": "date", "barmode": "group"})
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Datetime",
- ylabel="Trial",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_timeline",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_trials(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- metric: INT | str | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper left",
- figsize: tuple[INT, INT] = (900, 800),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the hyperparameter tuning trials.
-
- Creates a figure with two plots: the first plot shows the score
- of every trial and the second shows the distance between the
- last consecutive steps. The best trial is indicated with a star.
- This is the same plot as produced by `ht_params={"plot": True}`.
- This plot is only available for models that ran
- [hyperparameter tuning][].
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models that used hyperparameter
- tuning are selected.
-
- metric: int, str, sequence or None, default=None
- Metric to plot (only for multi-metric runs). Add `+` between
- options to select more than one. If None, all metrics are
- selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 800)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_evals
- atom.plots:HTPlot.plot_hyperparameters
- atom.plots:PredictionPlot.plot_results
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["ET", "RF"], n_trials=15)
- atom.plot_trials()
- ```
-
- """
- models = check_hyperparams(models, "plot_trials")
- metric = self._get_metric(metric, max_one=False)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0))
- xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29))
- for m in models:
- for met in metric:
- y = m.trials["score"].apply(lambda value: lst(value)[met])
-
- # Create star symbol at best trial
- symbols = ["circle"] * len(y)
- symbols[m.best_trial.number] = "star"
- sizes = [self.marker_size] * len(y)
- sizes[m.best_trial.number] = self.marker_size * 1.5
-
- fig.add_trace(
- self._draw_line(
- x=list(range(len(y))),
- y=y,
- mode="lines+markers",
- marker_symbol=symbols,
- marker_size=sizes,
- hovertemplate=None,
- parent=m.name,
- child=self._metric[met].name,
- legend=legend,
- xaxis=xaxis2,
- yaxis=yaxis,
- )
- )
-
- fig.add_trace(
- self._draw_line(
- x=list(range(1, len(y))),
- y=np.abs(np.diff(y)),
- mode="lines+markers",
- marker_symbol="circle",
- parent=m.name,
- child=self._metric[met].name,
- legend=legend,
- xaxis=xaxis2,
- yaxis=yaxis2,
- )
- )
-
- fig.update_layout(
- {
- f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}",
- f"xaxis{xaxis[1:]}_showticklabels": False,
- "hovermode": "x unified",
- },
- )
-
- self._plot(
- ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
- xlabel="Trial",
- ylabel="d",
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- groupclick="togglegroup",
- ylabel="Score",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_trials",
- filename=filename,
- display=display,
- )
-
-
-@typechecked
-class PredictionPlot(BasePlot):
- """Prediction plots.
-
- Plots that use the model's predictions. These plots are accessible
- from the runners or from the models. If called from a runner, the
- `models` parameter has to be specified (if None, uses all models).
- If called from a model, that model is used and the `models` parameter
- becomes unavailable.
-
- """
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_calibration(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- n_bins: INT = 10,
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper left",
- figsize: tuple[INT, INT] = (900, 900),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the calibration curve for a binary classifier.
-
- Well calibrated classifiers are probabilistic classifiers for
- which the output of the `predict_proba` method can be directly
- interpreted as a confidence level. For instance a well
- calibrated (binary) classifier should classify the samples such
- that among the samples to which it gave a `predict_proba` value
- close to 0.8, approx. 80% actually belong to the positive class.
- Read more in sklearn's [documentation][calibration].
-
- This figure shows two plots: the calibration curve, where the
- x-axis represents the average predicted probability in each bin
- and the y-axis is the fraction of positives, i.e. the proportion
- of samples whose class is the positive class (in each bin); and
- a distribution of all predicted probabilities of the classifier.
- This plot is available only for models with a `predict_proba`
- method in a binary or [multilabel][] classification task.
-
- !!! tip
- Use the [calibrate][adaboost-calibrate] method to calibrate
- the winning model.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the metric. Use a sequence
- or add `+` between options to select more than one. Choose
- from: "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- n_bins: int, default=10
- Number of bins used for calibration. Minimum of 5 required.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 900)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_lift
- atom.plots:PredictionPlot.plot_prc
- atom.plots:PredictionPlot.plot_roc
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["RF", "LGB"])
- atom.plot_calibration()
- ```
-
- """
- check_predict_proba(models, "plot_calibration")
- dataset = self._get_set(dataset, max_one=False)
- target = self.branch._get_target(target, only_columns=True)
-
- if n_bins < 5:
- raise ValueError(
- "Invalid value for the n_bins parameter."
- f"Value should be >=5, got {n_bins}."
- )
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0))
- xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29))
- for m in models:
- for ds in dataset:
- y_true, y_pred = m._get_pred(ds, target, attr="predict_proba")
-
- # Get calibration (frac of positives and predicted values)
- frac_pos, pred = calibration_curve(y_true, y_pred, n_bins=n_bins)
-
- fig.add_trace(
- self._draw_line(
- x=pred,
- y=frac_pos,
- parent=m.name,
- child=ds,
- mode="lines+markers",
- marker_symbol="circle",
- legend=legend,
- xaxis=xaxis2,
- yaxis=yaxis,
- )
- )
-
- fig.add_trace(
- go.Histogram(
- x=y_pred,
- xbins=dict(start=0, end=1, size=1. / n_bins),
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
- name=m.name,
- legendgroup=m.name,
- showlegend=False,
- xaxis=xaxis2,
- yaxis=yaxis2,
- )
- )
-
- self._draw_straight_line(y="diagonal", xaxis=xaxis2, yaxis=yaxis)
-
- fig.update_layout(
- {
- f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}",
- f"xaxis{xaxis2[1:]}_showgrid": True,
- "barmode": "overlay",
- }
- )
-
- self._plot(
- ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
- xlabel="Predicted value",
- ylabel="Count",
- xlim=(0, 1),
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- groupclick="togglegroup",
- ylabel="Fraction of positives",
- ylim=(-0.05, 1.05),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_calibration",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task("class"))
- @composed(crash, plot_from_model)
- def plot_confusion_matrix(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str = "test",
- target: INT | str = 0,
- threshold: FLOAT = 0.5,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper right",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a model's confusion matrix.
-
- For one model, the plot shows a heatmap. For multiple models,
- it compares TP, FP, FN and TN in a barplot (not implemented
- for multiclass classification tasks). This plot is available
- only for classification tasks.
-
- !!! tip
- Fill the `threshold` parameter with the result from the
- model's `get_best_threshold` method to optimize the results.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str, default="test"
- Data set on which to calculate the confusion matrix. Choose
- from:` "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multioutput tasks][].
-
- threshold: float, default=0.5
- Threshold between 0 and 1 to convert predicted probabilities
- to class labels. Only for binary classification tasks.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the plot's type.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_calibration
- atom.plots:PredictionPlot.plot_threshold
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, test_size=0.4)
- atom.run(["LR", "RF"])
- atom.lr.plot_confusion_matrix() # For one model
- atom.plot_confusion_matrix() # For multiple models
- ```
-
- """
- ds = self._get_set(dataset, max_one=True)
- target = self.branch._get_target(target, only_columns=True)
-
- if self.task.startswith("multiclass") and len(models) > 1:
- raise NotImplementedError(
- "The plot_confusion_matrix method does not support "
- "the comparison of multiple models for multiclass "
- "or multiclass-multioutput classification tasks."
- )
-
- labels = np.array(
- (("True negatives", "False positives"), ("False negatives", "True positives"))
- )
-
- fig = self._get_figure()
- if len(models) == 1:
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(0, 0.87),
- coloraxis=dict(
- colorscale="Blues",
- cmin=0,
- cmax=100,
- title="Percentage of samples",
- font_size=self.label_fontsize,
- ),
- )
- else:
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- for m in models:
- y_true, y_pred = m._get_pred(ds, target, attr="predict")
- if threshold != 0.5:
- y_pred = (y_pred > threshold).astype("int")
-
- cm = confusion_matrix(y_true, y_pred)
- if len(models) == 1: # Create matrix heatmap
- ticks = m.mapping.get(target, np.unique(m.dataset[target]).astype(str))
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(0, 0.87),
- coloraxis=dict(
- colorscale="Blues",
- cmin=0,
- cmax=100,
- title="Percentage of samples",
- font_size=self.label_fontsize,
- ),
- )
-
- fig.add_trace(
- go.Heatmap(
- x=ticks,
- y=ticks,
- z=100. * cm / cm.sum(axis=1)[:, np.newaxis],
- coloraxis=f"coloraxis{xaxis[1:]}",
- text=cm,
- customdata=labels,
-                        texttemplate="%{text}<br>(%{z:.2f}%)",
- textfont=dict(size=self.label_fontsize),
-                        hovertemplate=(
-                            "%{customdata}<br>" if is_binary(self.task) else ""
-                            "x:%{x}<br>y:%{y}<br>z:%{z}"
-                        ),
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- "template": "plotly_white",
- f"yaxis{yaxis[1:]}_autorange": "reversed",
- f"xaxis{xaxis[1:]}_showgrid": False,
- f"yaxis{yaxis[1:]}_showgrid": False,
- }
- )
-
- else:
- color = BasePlot._fig.get_elem(m.name)
- fig.add_trace(
- go.Bar(
- x=cm.ravel(),
- y=labels.ravel(),
- orientation="h",
- marker=dict(
- color=f"rgba({color[4:-1]}, 0.2)",
- line=dict(width=2, color=color),
- ),
- hovertemplate="%{x}",
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(bargroupgap=0.05)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Predicted label" if len(models) == 1 else "Count",
- ylabel="True label" if len(models) == 1 else None,
- title=title,
- legend=legend,
- figsize=figsize or ((800, 800) if len(models) == 1 else (900, 600)),
- plotname="plot_confusion_matrix",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_det(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ):
- """Plot the Detection Error Tradeoff curve.
-
- Read more about [DET][] in sklearn's documentation. Only
- available for binary classification tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the metric. Use a sequence
- or add `+` between options to select more than one. Choose
- from: "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_gains
- atom.plots:PredictionPlot.plot_roc
- atom.plots:PredictionPlot.plot_prc
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_det()
- ```
-
- """
- dataset = self._get_set(dataset, max_one=False)
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- for ds in dataset:
- # Get fpr-fnr pairs for different thresholds
- fpr, fnr, _ = det_curve(*m._get_pred(ds, target, attr="thresh"))
-
- fig.add_trace(
- self._draw_line(
- x=fpr,
- y=fnr,
- mode="lines",
- parent=m.name,
- child=ds,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="FPR",
- ylabel="FNR",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_det",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task("reg"))
- @composed(crash, plot_from_model)
- def plot_errors(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str = "test",
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a model's prediction errors.
-
- Plot the actual targets from a set against the predicted values
- generated by the regressor. A linear fit is made on the data.
- The gray, intersected line shows the identity line. This plot
- can be useful to detect noise or heteroscedasticity along a
- range of the target domain. This plot is available only for
- regression tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str, default="test"
- Data set on which to calculate the metric. Choose from:
- "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multioutput tasks][].
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_residuals
-
- Examples
- --------
- ```pycon
- from atom import ATOMRegressor
- from sklearn.datasets import load_diabetes
-
- X, y = load_diabetes(return_X_y=True, as_frame=True)
-
- atom = ATOMRegressor(X, y)
- atom.run(["OLS", "LGB"])
- atom.plot_errors()
- ```
-
- """
- ds = self._get_set(dataset, max_one=True)
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- y_true, y_pred = m._get_pred(ds, target)
-
- fig.add_trace(
- go.Scatter(
- x=y_true,
- y=y_pred,
- mode="markers",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- # Fit the points using linear regression
- from atom.models import OrdinaryLeastSquares
- model = OrdinaryLeastSquares(goal=self.goal, branch=m.branch)._get_est()
- model.fit(y_true.values.reshape(-1, 1), y_pred)
-
- fig.add_trace(
- go.Scatter(
- x=(x := np.linspace(y_true.min(), y_true.max(), 100)),
- y=model.predict(x[:, np.newaxis]),
- mode="lines",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- hovertemplate="(%{x}, %{y})",
- legendgroup=m.name,
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- groupclick="togglegroup",
- xlabel="True value",
- title=title,
- legend=legend,
- ylabel="Predicted value",
- figsize=figsize,
- plotname="plot_errors",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(ensembles=False))
- def plot_evals(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot evaluation curves.
-
- The evaluation curves are the main metric scores achieved by the
- models at every iteration of the training process. This plot is
- available only for models that allow [in-training validation][].
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the evaluation curves. Use a
- sequence or add `+` between options to select more than one.
- Choose from: "train" or "test".
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:HTPlot.plot_trials
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["XGB", "LGB"])
- atom.plot_evals()
- ```
-
- """
- dataset = self._get_set(dataset, max_one=False, allow_holdout=False)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- if not m.evals:
- raise ValueError(
- "Invalid value for the models parameter. Model "
- f"{m.name} has no in-training validation."
- )
-
- for ds in dataset:
- fig.add_trace(
- self._draw_line(
- x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))),
- y=m.evals[f"{self._metric[0].name}_{ds}"],
- marker_symbol="circle",
- parent=m.name,
- child=ds,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Iterations",
- ylabel=self._metric[0].name,
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_evals",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_feature_importance(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- show: INT | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a model's feature importance.
-
- The sum of importances for all features (per model) is 1.
- This plot is available only for models whose estimator has
- a `scores_`, `feature_importances_` or `coef` attribute.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_parshap
- atom.plots:PredictionPlot.plot_partial_dependence
- atom.plots:PredictionPlot.plot_permutation_importance
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_feature_importance(show=10)
- ```
-
- """
- show = self._get_show(show, models)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- if (fi := m.feature_importance) is None:
- raise ValueError(
- "Invalid value for the models parameter. The estimator "
- f"{m.estimator.__class__.__name__} has no feature_importances_ "
- "nor coef_ attribute."
- )
-
- fig.add_trace(
- go.Bar(
- x=fi,
- y=fi.index,
- orientation="h",
- marker=dict(
- color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
- line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
- ),
- hovertemplate="%{x}",
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
- "bargroupgap": 0.05,
- }
- )
-
- # Unique number of features over all branches
- n_fxs = len(set([fx for m in models for fx in m.features]))
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Normalized feature importance",
- ylim=(n_fxs - show - 0.5, n_fxs - 0.5),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_feature_importance",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task("forecast"))
- @composed(crash, plot_from_model(check_fitted=False))
- def plot_forecast(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- fh: int | str | range | SEQUENCE | ForecastingHorizon = "test",
- X: FEATURES | None = None,
- target: INT | str = 0,
- plot_interval: bool = True,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper left",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot a time series with model forecasts.
-
- This plot is only available for forecasting tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected. If no
- models are selected, only the target column is plotted.
-
- fh: int, str, range, sequence or [ForecastingHorizon][], default="test"
- Forecast horizon for which to plot the predictions. If
- string, choose from: "train", "test" or "holdout". Use a
- sequence or add `+` between options to select more than one.
-
- X: dataframe-like or None, default=None
- Exogenous time series corresponding to fh. This parameter
- is ignored if fh is a data set.
-
- target: int or str, default=0
- Target column to look at. Only for [multivariate][] tasks.
-
- plot_interval: bool, default=True
- Whether to plot prediction intervals instead of the exact
- prediction values. If True, the plotted estimators should
- have a `predict_interval` method.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_lift
- atom.plots:PredictionPlot.plot_prc
- atom.plots:PredictionPlot.plot_roc
-
- Examples
- --------
- ```pycon
- from atom import ATOMForecaster
- from sktime.datasets import load_airline
-
- y = load_airline()
-
- atom = ATOMForecaster(y, random_state=1)
- atom.plot_forecast()
- atom.run(
- models="arima",
- est_params={"order": (1, 1, 0), "seasonal_order": (0, 1, 0, 12)},
- )
- atom.plot_forecast()
- atom.plot_forecast(fh="train+test", plot_interval=False)
-
- # Forecast the next 4 years starting from the test set
- atom.plot_forecast(fh=range(1, 48))
- ```
-
- """
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- # Draw original time series
- for ds in ("train", "test"):
- fig.add_trace(
- go.Scatter(
- x=self._get_plot_index(getattr(self, ds)),
- y=getattr(self, ds)[target],
- mode="lines+markers",
- line=dict(
- width=2,
- color="black",
- dash=BasePlot._fig.get_elem(ds, "dash"),
- ),
- opacity=0.6,
- name=ds,
- showlegend=False if models else BasePlot._fig.showlegend(ds, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- # Draw predictions
- for m in models:
- if isinstance(fh, str):
- # Get fh and corresponding X from data set
- datasets = self._get_set(fh, max_one=False)
- fh = bk.concat([getattr(m, ds) for ds in datasets]).index
- X = m.X.loc[fh]
-
- y_pred = m.predict(fh, X)
- if is_multioutput(self.task):
- y_pred = y_pred[target]
-
- fig.add_trace(
- self._draw_line(
- x=self._get_plot_index(y_pred),
- y=y_pred,
- mode="lines+markers",
- parent=m.name,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- if plot_interval:
- try:
- y_pred = m.predict_interval(fh, X)
- except NotImplementedError:
- continue # Fails for some models like ES
-
- if is_multioutput(self.task):
- # Select interval of target column for multivariate
- y = y_pred.iloc[:, y_pred.columns.get_loc(target)]
- else:
- y = y_pred # Univariate
-
- fig.add_traces(
- [
- go.Scatter(
- x=self._get_plot_index(y_pred),
- y=y.iloc[:, 1],
- mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
-                            hovertemplate=f"%{{y}}<extra>{m.name} - upper bound</extra>",
- legendgroup=m.name,
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- ),
- go.Scatter(
- x=self._get_plot_index(y_pred),
- y=y.iloc[:, 0],
- mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
- fill="tonexty",
- fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
-                            hovertemplate=f"%{{y}}<extra>{m.name} - lower bound</extra>",
- legendgroup=m.name,
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- ]
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- groupclick="togglegroup" if plot_interval else "toggleitem",
- xlabel=self.y.index.name,
- ylabel=target,
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_forecast",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_gains(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the cumulative gains curve.
-
- This plot is available only for binary and [multilabel][]
- classification tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the metric. Use a sequence
- or add `+` between options to select more than one. Choose
- from: "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_det
- atom.plots:PredictionPlot.plot_lift
- atom.plots:PredictionPlot.plot_roc
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_gains()
- ```
-
- """
- dataset = self._get_set(dataset, max_one=False)
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- for ds in dataset:
- y_true, y_pred = m._get_pred(ds, target, attr="thresh")
-
- fig.add_trace(
- self._draw_line(
- x=np.arange(start=1, stop=len(y_true) + 1) / len(y_true),
- y=np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum(),
- mode="lines",
- parent=m.name,
- child=ds,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Fraction of sample",
- ylabel="Gain",
- xlim=(0, 1),
- ylim=(0, 1.02),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_gains",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(ensembles=False))
- def plot_learning_curve(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- metric: INT | str | SEQUENCE | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the learning curve: score vs number of training samples.
-
- This plot is available only for models fitted using
- [train sizing][]. [Ensembles][] are ignored.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- metric: int, str, sequence or None, default=None
- Metric to plot (only for multi-metric runs). Use a sequence
- or add `+` between options to select more than one. If None,
- the metric used to run the pipeline is selected.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_results
- atom.plots:PredictionPlot.plot_successive_halving
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.train_sizing(["LR", "RF"], n_bootstrap=5)
- atom.plot_learning_curve()
- ```
-
- """
- metric = self._get_metric(metric, max_one=False)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- for met in metric:
- x, y, std = defaultdict(list), defaultdict(list), defaultdict(list)
- for m in models:
- x[m._group].append(m._train_idx)
- y[m._group].append(get_best_score(m, met))
- if m.bootstrap is not None:
- std[m._group].append(m.bootstrap.iloc[:, met].std())
-
- for group in x:
- fig.add_trace(
- self._draw_line(
- x=x[group],
- y=y[group],
- mode="lines+markers",
- marker_symbol="circle",
- error_y=dict(type="data", array=std[group], visible=True),
- parent=group,
- child=self._metric[met].name,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- # Add error bands
- if m.bootstrap is not None:
- fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)"
- fig.add_traces(
- [
- go.Scatter(
- x=x[group],
- y=np.add(y[group], std[group]),
- mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(group)),
-                                hovertemplate="%{y}<extra>upper bound</extra>",
- legendgroup=group,
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- ),
- go.Scatter(
- x=x[group],
- y=np.subtract(y[group], std[group]),
- mode="lines",
- line=dict(width=1, color=BasePlot._fig.get_elem(group)),
- fill="tonexty",
- fillcolor=fillcolor,
-                                hovertemplate="%{y}<extra>lower bound</extra>",
- legendgroup=group,
- showlegend=False,
- xaxis=xaxis,
- yaxis=yaxis,
- ),
- ]
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- groupclick="togglegroup",
- title=title,
- legend=legend,
- xlabel="Number of training samples",
- ylabel="Score",
- figsize=figsize,
- plotname="plot_learning_curve",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_lift(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the lift curve.
-
- Only available for binary classification tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the metric. Use a sequence
- or add `+` between options to select more than one. Choose
- from: "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_det
- atom.plots:PredictionPlot.plot_gains
- atom.plots:PredictionPlot.plot_prc
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_lift()
- ```
-
- """
- dataset = self._get_set(dataset, max_one=False)
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- for ds in dataset:
- y_true, y_pred = m._get_pred(ds, target, attr="thresh")
-
- gains = np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()
- fig.add_trace(
- self._draw_line(
- x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
- y=gains / x,
- mode="lines",
- parent=m.name,
- child=ds,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(y=1, xaxis=xaxis, yaxis=yaxis)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Fraction of sample",
- ylabel="Lift",
- xlim=(0, 1),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_lift",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_parshap(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- columns: SLICE | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "upper left",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the partial correlation of shap values.
-
- Plots the train and test correlation between the shap value of
- every feature with its target value, after removing the effect
- of all other features (partial correlation). This plot is
- useful to identify the features that are contributing most to
- overfitting. Features that lie below the bisector (diagonal
- line) performed worse on the test set than on the training set.
- If the estimator has a `scores_`, `feature_importances_` or
- `coef_` attribute, its normalized values are shown in a color
- map.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- columns: int, str, slice, sequence or None, default=None
- Features to plot. If None, it plots all features.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="upper left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_feature_importance
- atom.plots:PredictionPlot.plot_partial_dependence
- atom.plots:PredictionPlot.plot_permutation_importance
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["GNB", "RF"])
- atom.rf.plot_parshap(legend=None)
- atom.plot_parshap(columns=slice(5, 10))
- ```
-
- """
- target = self.branch._get_target(target)
-
- fig = self._get_figure()
-
- # Colorbar is only needed when a model has feature_importance
- if all(m.feature_importance is None for m in models):
- xaxis, yaxis = BasePlot._fig.get_axes()
- else:
- xaxis, yaxis = BasePlot._fig.get_axes(
- x=(0, 0.87),
- coloraxis=dict(
- colorscale="Reds",
- title="Normalized feature importance",
- font_size=self.label_fontsize,
- )
- )
-
- for m in models:
- parshap = {}
- fxs = m.branch._get_columns(columns, include_target=False)
-
- for ds in ("train", "test"):
- # Calculating shap values is computationally expensive,
- # therefore select a random subsample for large data sets
- if len(data := getattr(m, ds)) > 500:
- data = data.sample(500, random_state=self.random_state)
-
- # Replace data with the calculated shap values
- explanation = m._shap.get_explanation(data[m.features], target)
- data[m.features] = explanation.values
-
- parshap[ds] = pd.Series(index=fxs, dtype=float)
- for fx in fxs:
- # All other features are covariates
- covariates = [f for f in data.columns[:-1] if f != fx]
- cols = [fx, data.columns[-1], *covariates]
-
- # Compute covariance
- V = data[cols].cov()
-
- # Inverse covariance matrix
- Vi = np.linalg.pinv(V, hermitian=True)
- diag = Vi.diagonal()
-
- D = np.diag(np.sqrt(1 / diag))
-
- # Partial correlation matrix
- partial_corr = -1 * (D @ Vi @ D) # @ is matrix multiplication
-
- # Semi-partial correlation matrix
- with np.errstate(divide="ignore"):
- V_sqrt = np.sqrt(np.diag(V))[..., None]
- Vi_sqrt = np.sqrt(np.abs(diag - Vi ** 2 / diag[..., None])).T
- semi_partial_correlation = partial_corr / V_sqrt / Vi_sqrt
-
- # X covariates are removed
- parshap[ds][fx] = semi_partial_correlation[1, 0]
-
- # Get the feature importance or coefficients
- if m.feature_importance is not None:
- color = m.feature_importance.loc[fxs]
- else:
- color = BasePlot._fig.get_elem("parshap")
-
- fig.add_trace(
- go.Scatter(
- x=parshap["train"],
- y=parshap["test"],
- mode="markers+text",
- marker=dict(
- color=color,
- size=self.marker_size,
- coloraxis=f"coloraxis{xaxis[1:]}",
- line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
- ),
- text=m.features,
- textposition="top center",
- customdata=(data := None if isinstance(color, str) else list(color)),
- hovertemplate=(
-                        f"%{{text}}<br>(%{{x}}, %{{y}})"
-                        f"{'<br>Feature importance: %{customdata:.4f}' if data else ''}"
-                        f"<extra>{m.name}</extra>"
- ),
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Training set",
- ylabel="Test set",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_parshap",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_partial_dependence(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- columns: SLICE | None = None,
- kind: str | SEQUENCE = "average",
- pair: int | str | None = None,
- target: INT | str = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the partial dependence of features.
-
- The partial dependence of a feature (or a set of features)
- corresponds to the response of the model for each possible
- value of the feature. The plot can take two forms:
-
- - If `pair` is None: Single feature partial dependence lines.
- The deciles of the feature values are shown with tick marks
- on the bottom.
- - If `pair` is defined: Two-way partial dependence plots are
- plotted as contour plots (only allowed for a single model).
-
- Read more about partial dependence on sklearn's
- [documentation][partial_dependence]. This plot is not available
- for multilabel nor multiclass-multioutput classification tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- columns: int, str, slice, sequence or None, default=None
- Features to get the partial dependence from. If None, it
- uses the first 3 features in the dataset.
-
- kind: str or sequence, default="average"
- Kind of depedence to plot. Use a sequence or add `+` between
- options to select more than one. Choose from:
-
- - "average": Partial dependence averaged across all samples
- in the dataset.
- - "individual": Partial dependence for up to 50 random
- samples (Individual Conditional Expectation).
-
- This parameter is ignored when plotting feature pairs.
-
- pair: int, str or None, default=None
- Feature with which to pair the features selected by
- `columns`. If specified, the resulting figure displays
- contour plots. Only allowed when plotting a single model.
- If None, the plots show the partial dependece of single
- features.
-
- target: int or str, default=1
- Class in the target column to look at (only for multiclass
- classification tasks).
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_feature_importance
- atom.plots:PredictionPlot.plot_parshap
- atom.plots:PredictionPlot.plot_permutation_importance
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_partial_dependence(kind="average+individual", legend="upper left")
- atom.rf.plot_partial_dependence(columns=(3, 4), pair=2)
- ```
-
- """
- if any(self.task.startswith(t) for t in ("multilabel", "multiclass-multioutput")):
- raise PermissionError(
- "The plot_partial_dependence method is not available for multilabel "
- f"nor multiclass-multioutput classification tasks, got {self.task}."
- )
- elif self.task.startswith("multiclass"):
- _, target = self.branch._get_target(target)
- else:
- target = 0
-
- kind = "+".join(lst(kind)).lower()
- if any(k not in ("average", "individual") for k in kind.split("+")):
- raise ValueError(
- f"Invalid value for the kind parameter, got {kind}. "
- "Choose from: average, individual."
- )
-
- axes, names = [], []
- fig = self._get_figure()
- for m in models:
- color = BasePlot._fig.get_elem(m.name)
-
- # Since every model can have different fxs, select them
- # every time and make sure the models use the same fxs
- cols = m.branch._get_columns(
- columns=(0, 1, 2) if columns is None else columns,
- include_target=False,
- )
-
- if not names:
- names = cols
- elif names != cols:
- raise ValueError(
- "Invalid value for the columns parameter. Not all "
- f"models use the same features, got {names} and {cols}."
- )
-
- if pair is not None:
- if len(models) > 1:
- raise ValueError(
- f"Invalid value for the pair parameter, got {pair}. "
- "The value must be None when plotting multiple models"
- )
- else:
- pair = m.branch._get_columns(pair, include_target=False)
- cols = [(c, pair[0]) for c in cols]
- else:
- cols = [(c,) for c in cols]
-
- # Create new axes
- if not axes:
- for i, col in enumerate(cols):
- # Calculate the distance between subplots
- offset = divide(0.025, len(cols) - 1)
-
- # Calculate the size of the subplot
- size = (1 - ((offset * 2) * (len(cols) - 1))) / len(cols)
-
- # Determine the position for the axes
- x_pos = i % len(cols) * (size + 2 * offset)
-
- xaxis, yaxis = BasePlot._fig.get_axes(x=(x_pos, rnd(x_pos + size)))
- axes.append((xaxis, yaxis))
-
- # Compute averaged predictions
- predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)(
- delayed(partial_dependence)(
- estimator=m.estimator,
- X=m.X_test,
- features=col,
- kind="both" if "individual" in kind else "average",
- ) for col in cols
- )
-
- # Compute deciles for ticks (only if line plots)
- if len(cols[0]) == 1:
- deciles = {}
- for fx in chain.from_iterable(cols):
- if fx not in deciles: # Skip if the feature is repeated
- X_col = _safe_indexing(m.X_test, fx, axis=1)
- deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1))
-
- for i, (ax, fx, pred) in enumerate(zip(axes, cols, predictions)):
- # Draw line or contour plot
- if len(pred["values"]) == 1:
- # For both average and individual: draw ticks on the horizontal axis
- for line in deciles[fx[0]]:
- fig.add_shape(
- type="line",
- x0=line,
- x1=line,
- xref=ax[0],
- y0=0,
- y1=0.05,
- yref=f"{axes[0][1]} domain",
- line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
- opacity=0.6,
- layer="below",
- )
-
- # Draw the mean of the individual lines
- if "average" in kind:
- fig.add_trace(
- go.Scatter(
- x=pred["values"][0],
- y=pred["average"][target].ravel(),
- mode="lines",
- line=dict(width=2, color=color),
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=ax[0],
- yaxis=axes[0][1],
- )
- )
-
- # Draw all individual (per sample) lines (ICE)
- if "individual" in kind:
- # Select up to 50 random samples to plot
- idx = np.random.choice(
- list(range(len(pred["individual"][target]))),
- size=min(len(pred["individual"][target]), 50),
- replace=False,
- )
- for sample in pred["individual"][target, idx, :]:
- fig.add_trace(
- go.Scatter(
- x=pred["values"][0],
- y=sample,
- mode="lines",
- line=dict(width=0.5, color=color),
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=ax[0],
- yaxis=axes[0][1],
- )
- )
-
- else:
- colorscale = PALETTE.get(BasePlot._fig.get_elem(m.name), "Teal")
- fig.add_trace(
- go.Contour(
- x=pred["values"][0],
- y=pred["values"][1],
- z=pred["average"][target],
- contours=dict(
- showlabels=True,
- labelfont=dict(size=self.tick_fontsize, color="white")
- ),
-                            hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
- hoverongaps=False,
- colorscale=colorscale,
- showscale=False,
- showlegend=False,
- xaxis=ax[0],
- yaxis=axes[0][1],
- )
- )
-
- self._plot(
- ax=(f"xaxis{ax[0][1:]}", f"yaxis{ax[1][1:]}"),
- xlabel=fx[0],
- ylabel=(fx[1] if len(fx) > 1 else "Score") if i == 0 else None,
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- groupclick="togglegroup",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_partial_dependence",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model)
- def plot_permutation_importance(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- show: INT | None = None,
- n_repeats: INT = 10,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the feature permutation importance of models.
-
- !!! warning
- This method can be slow. Results are cached to fasten
- repeated calls.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- n_repeats: int, default=10
- Number of times to permute each feature.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_feature_importance
- atom.plots:PredictionPlot.plot_partial_dependence
- atom.plots:PredictionPlot.plot_parshap
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_permutation_importance(show=10, n_repeats=7)
- ```
-
- """
- show = self._get_show(show, models)
-
- if n_repeats <= 0:
- raise ValueError(
- "Invalid value for the n_repeats parameter."
- f"Value should be >0, got {n_repeats}."
- )
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- for m in models:
- # Permutation importances returns Bunch object
- permutations = self._memory.cache(permutation_importance)(
- estimator=m.estimator,
- X=m.X_test,
- y=m.y_test,
- scoring=self._metric[0],
- n_repeats=n_repeats,
- n_jobs=self.n_jobs,
- random_state=self.random_state,
- )
-
- fig.add_trace(
- go.Box(
- x=permutations["importances"].ravel(),
- y=list(np.array([[fx] * n_repeats for fx in m.features]).ravel()),
- marker_color=BasePlot._fig.get_elem(m.name),
- boxpoints="outliers",
- orientation="h",
- name=m.name,
- legendgroup=m.name,
- showlegend=BasePlot._fig.showlegend(m.name, legend),
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- fig.update_layout(
- {
- f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
- "boxmode": "group",
- }
- )
-
- # Unique number of features over all branches
- n_fxs = len(set([fx for m in models for fx in m.features]))
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Score",
- ylim=(n_fxs - show - 0.5, n_fxs - 0.5),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_permutation_importance",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(check_fitted=False))
- def plot_pipeline(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- draw_hyperparameter_tuning: bool = True,
- color_branches: bool | None = None,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot a diagram of the pipeline.
-
- !!! warning
- This plot uses the [schemdraw][] package, which is
- incompatible with [plotly][]. The returned plot is
- therefore a [matplotlib figure][pltfigure].
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models for which to draw the pipeline. If None, all
- pipelines are plotted.
-
- draw_hyperparameter_tuning: bool, default=True
- Whether to draw if the models used Hyperparameter Tuning.
-
- color_branches: bool or None, default=None
- Whether to draw every branch in a different color. If None,
- branches are colored when there is more than one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the pipeline drawn.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:DataPlot.plot_wordcloud
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["GNB", "RNN", "SGD", "MLP"])
- atom.voting(models=atom.winners[:2])
- atom.plot_pipeline()
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.scale()
- atom.prune()
- atom.run("RF", n_trials=30)
-
- atom.branch = "undersample"
- atom.balance("nearmiss")
- atom.run("RF_undersample")
-
- atom.branch = "oversample_from_master"
- atom.balance("smote")
- atom.run("RF_oversample")
-
- atom.plot_pipeline()
- ```
-
- """
-
- def get_length(pl, i):
- """Get the maximum length of the name of a block."""
- if len(pl) > i:
- return max(len(pl[i].__class__.__name__) * 0.5, 7)
- else:
- return 0
-
- def check_y(xy):
- """Return y unless there is something right, then jump."""
- while any(pos[0] > xy[0] and pos[1] == xy[1] for pos in positions.values()):
- xy = Point((xy[0], xy[1] + height))
-
- return xy[1]
-
- def add_wire(x, y):
- """Draw a connecting wire between two estimators."""
- d.add(
- Wire(shape="z", k=(x - d.here[0]) / (length + 1), arrow="->")
- .to((x, y))
- .color(branch["color"])
- )
-
- # Update arrowhead manually
- d.elements[-1].segments[-1].arrowwidth = 0.3
- d.elements[-1].segments[-1].arrowlength = 0.5
-
- check_dependency("schemdraw")
- from schemdraw import Drawing
- from schemdraw.flow import Data, RoundBox, Subroutine, Wire
- from schemdraw.util import Point
-
- fig = self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_pipeline")
-
- # Define branches to plot (if called from model, it's only one)
- branches = []
- for branch in getattr(self, "_branches", [self.branch]):
- draw_models, draw_ensembles = [], []
- for m in models:
- if m.branch is branch:
- if m.acronym not in ("Stack", "Vote"):
- draw_models.append(m)
- else:
- draw_ensembles.append(m)
-
- # Additionally, add all dependent models (if not already there)
- draw_models.extend([i for i in m._models if i not in draw_models])
-
- if not models or draw_models:
- branches.append(
- {
- "name": branch.name,
- "pipeline": list(branch.pipeline),
- "models": draw_models,
- "ensembles": draw_ensembles,
- }
- )
-
- # Define colors per branch
- for branch in branches:
- if color_branches or (color_branches is None and len(branches) > 1):
- color = next(BasePlot._fig.palette)
-
- # Convert back to format accepted by matplotlib
- branch["color"] = unconvert_from_RGB_255(unlabel_rgb(color))
- else:
- branch["color"] = "black"
-
- # Create schematic drawing
- d = Drawing(unit=1, backend="matplotlib")
- d.config(fontsize=self.tick_fontsize)
- d.add(Subroutine(w=8, s=0.7).label("Raw data"))
-
- height = 3 # Height of every block
- length = 5 # Minimum arrow length
-
- # Define the x-position for every block
- x_pos = [d.here[0] + length]
- for i in range(max(len(b["pipeline"]) for b in branches)):
- len_block = reduce(max, [get_length(b["pipeline"], i) for b in branches])
- x_pos.append(x_pos[-1] + length + len_block)
-
- # Add positions for scaling, hyperparameter tuning and models
- x_pos.extend([x_pos[-1], x_pos[-1]])
- if any(m.scaler for m in models):
- x_pos[-1] = x_pos[-2] = x_pos[-3] + length + 7
- if draw_hyperparameter_tuning and any(m.trials is not None for m in models):
- x_pos[-1] = x_pos[-2] + length + 11
-
- positions = {0: d.here} # Contains the position of every element
- for branch in branches:
- d.here = positions[0]
-
- for i, est in enumerate(branch["pipeline"]):
- # If the estimator has already been seen, don't draw
- if id(est) in positions:
- # Change location to estimator's end
- d.here = positions[id(est)]
- continue
-
- # Draw transformer
- add_wire(x_pos[i], check_y(d.here))
- d.add(
- RoundBox(w=max(len(est.__class__.__name__) * 0.5, 7))
- .label(est.__class__.__name__, color="k")
- .color(branch["color"])
- .anchor("W")
- .drop("E")
- )
-
- positions[id(est)] = d.here
-
- for model in branch["models"]:
- # Position at last transformer or at start
- if branch["pipeline"]:
- d.here = positions[id(est)]
- else:
- d.here = positions[0]
-
- # For a single branch, center models
- if len(branches) == 1:
- offset = height * (len(branch["models"]) - 1) / 2
- else:
- offset = 0
-
- # Draw automated feature scaling
- if model.scaler:
- add_wire(x_pos[-3], check_y((d.here[0], d.here[1] - offset)))
- d.add(
- RoundBox(w=7)
- .label("Scaler", color="k")
- .color(branch["color"])
- .drop("E")
- )
- offset = 0
-
- # Draw hyperparameter tuning
- if draw_hyperparameter_tuning and model.trials is not None:
- add_wire(x_pos[-2], check_y((d.here[0], d.here[1] - offset)))
- d.add(
- Data(w=11)
- .label("Hyperparameter\nTuning", color="k")
- .color(branch["color"])
- .drop("E")
- )
- offset = 0
-
- # Remove classifier/regressor from model's name
- name = model.estimator.__class__.__name__
- if name.lower().endswith("classifier"):
- name = name[:-10]
- elif name.lower().endswith("regressor"):
- name = name[:-9]
-
- # Draw model
- add_wire(x_pos[-1], check_y((d.here[0], d.here[1] - offset)))
- d.add(
- Data(w=max(len(name) * 0.5, 7))
- .label(name, color="k")
- .color(branch["color"])
- .anchor("W")
- .drop("E")
- )
-
- positions[id(model)] = d.here
-
- # Draw ensembles
- max_pos = max(pos[0] for pos in positions.values()) # Max length model names
- for branch in branches:
- for model in branch["ensembles"]:
- # Determine y-position of the ensemble
- y_pos = [positions[id(m)][1] for m in model._models]
- offset = height / 2 * (len(branch["ensembles"]) - 1)
- y = min(y_pos) + (max(y_pos) - min(y_pos)) * 0.5 - offset
- y = check_y((max_pos + length, max(min(y_pos), y)))
-
- d.here = (max_pos + length, y)
-
- d.add(
- Data(w=max(len(model._fullname) * 0.5, 7))
- .label(model._fullname, color="k")
- .color(branch["color"])
- .anchor("W")
- .drop("E")
- )
-
- positions[id(model)] = d.here
-
- # Draw a wire from every model to the ensemble
- for m in model._models:
- d.here = positions[id(m)]
- add_wire(max_pos + length, y)
-
- if not figsize:
- dpi, bbox = fig.get_dpi(), d.get_bbox()
- figsize = (dpi * bbox.xmax // 4, (dpi / 2) * (bbox.ymax - bbox.ymin))
-
- d.draw(canvas=plt.gca(), showframe=False, show=False)
- plt.axis("off")
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=plt.gca(),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_pipeline",
- filename=filename,
- display=display,
- )
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_prc(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower left",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the precision-recall curve.
-
- Read more about [PRC][] in sklearn's documentation. Only
- available for binary classification tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the metric. Use a sequence
- or add `+` between options to select more than one. Choose
- from: "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_det
- atom.plots:PredictionPlot.plot_lift
- atom.plots:PredictionPlot.plot_roc
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_prc()
- ```
-
- """
- dataset = self._get_set(dataset, max_one=False)
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- for ds in dataset:
- y_true, y_pred = m._get_pred(ds, target, attr="thresh")
-
- # Get precision-recall pairs for different thresholds
- prec, rec, _ = precision_recall_curve(y_true, y_pred)
-
- fig.add_trace(
- self._draw_line(
- x=rec,
- y=prec,
- mode="lines",
- parent=m.name,
- child=ds,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(sum(m.y_test) / len(m.y_test), xaxis=xaxis, yaxis=yaxis)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Recall",
- ylabel="Precision",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_prc",
- filename=filename,
- display=display,
- )
-
    @available_if(has_task("class"))
    @composed(crash, plot_from_model)
    def plot_probabilities(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        dataset: str = "test",
        target: INT | str | tuple = 1,
        *,
        title: str | dict | None = None,
        legend: str | dict | None = "upper right",
        figsize: tuple[INT, INT] = (900, 600),
        filename: str | None = None,
        display: bool | None = True,
    ) -> go.Figure | None:
        """Plot the probability distribution of the target classes.

        This plot is available only for models with a `predict_proba`
        method in classification tasks.

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models to plot. If None, all models are selected.

        dataset: str, default="test"
            Data set on which to calculate the metric. Choose from:
            "train", "test" or "holdout".

        target: int, str or tuple, default=1
            Probability of being that class in the target column. For
            multioutput tasks, the value should be a tuple of the form
            (column, class).

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="upper right"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple, default=(900, 600)
            Figure's size in pixels, format as (x, y).

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [go.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:PredictionPlot.plot_confusion_matrix
        atom.plots:PredictionPlot.plot_results
        atom.plots:PredictionPlot.plot_threshold

        Examples
        --------
        ```pycon
        from atom import ATOMClassifier
        from sklearn.datasets import make_classification

        X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)

        atom = ATOMClassifier(X, y, random_state=1)
        atom.run(["LR", "RF"])
        atom.plot_probabilities()
        ```

        """
        # Fail early if any selected model lacks predict_proba
        check_predict_proba(models, "plot_probabilities")
        ds = self._get_set(dataset, max_one=True)
        # Resolve target into (column index, class label), then map the
        # index to the actual column name
        col, cls = self.branch._get_target(target)
        col = lst(self.target)[col]

        fig = self._get_figure()
        xaxis, yaxis = BasePlot._fig.get_axes()
        for m in models:
            # Cached per-set attributes, e.g. m.y_test / m.predict_proba_test
            y_true, y_pred = getattr(m, f"y_{ds}"), getattr(m, f"predict_proba_{ds}")
            # One KDE trace per observed class value in the target column
            for value in np.unique(m.dataset[col]):
                # Get indices per class
                if is_multioutput(self.task):
                    if self.task.startswith("multilabel"):
                        # Multilabel: probabilities are indexed per column
                        hist = y_pred.loc[y_true[col] == value, col]
                    else:
                        # Multiclass-multioutput: first select (class, column)
                        hist = y_pred.loc[cls, col].loc[y_true[col] == value]
                else:
                    hist = y_pred.loc[y_true == value, str(cls)]

                fig.add_trace(
                    go.Scatter(
                        # Gaussian KDE evaluated on a fixed [0, 1] grid
                        x=(x := np.linspace(0, 1, 100)),
                        y=stats.gaussian_kde(hist)(x),
                        mode="lines",
                        line=dict(
                            width=2,
                            color=BasePlot._fig.get_elem(m.name),
                            dash=BasePlot._fig.get_elem(ds, "dash"),
                        ),
                        fill="tonexty",
                        # Translucent fill derived from the model's line color
                        fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
                        fillpattern=dict(shape=BasePlot._fig.get_elem(value, "shape")),
                        name=f"{col}={value}",
                        legendgroup=m.name,
                        legendgrouptitle=dict(text=m.name, font_size=self.label_fontsize),
                        showlegend=BasePlot._fig.showlegend(f"{m.name}-{value}", legend),
                        xaxis=xaxis,
                        yaxis=yaxis,
                    )
                )

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
            groupclick="toggleitem",
            xlabel="Probability",
            ylabel="Probability density",
            xlim=(0, 1),
            title=title,
            legend=legend,
            figsize=figsize,
            plotname="plot_probabilities",
            filename=filename,
            display=display,
        )
-
    @available_if(has_task("reg"))
    @composed(crash, plot_from_model)
    def plot_residuals(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        dataset: str = "test",
        target: INT | str = 0,
        *,
        title: str | dict | None = None,
        legend: str | dict | None = "upper left",
        figsize: tuple[INT, INT] = (900, 600),
        filename: str | None = None,
        display: bool | None = True,
    ) -> go.Figure | None:
        """Plot a model's residuals.

        The plot shows the residuals (difference between the predicted
        and the true value) on the vertical axis and the independent
        variable on the horizontal axis. The gray, intersected line
        shows the identity line. This plot can be useful to analyze the
        variance of the error of the regressor. If the points are
        randomly dispersed around the horizontal axis, a linear
        regression model is appropriate for the data; otherwise, a
        non-linear model is more appropriate. This plot is only
        available for regression tasks.

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models to plot. If None, all models are selected.

        dataset: str, default="test"
            Data set on which to calculate the metric. Choose from:
            "train", "test" or "holdout".

        target: int or str, default=0
            Target column to look at. Only for [multioutput tasks][].

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="upper left"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple, default=(900, 600)
            Figure's size in pixels, format as (x, y).

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [go.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:PredictionPlot.plot_errors

        Examples
        --------
        ```pycon
        from atom import ATOMRegressor
        from sklearn.datasets import load_diabetes

        X, y = load_diabetes(return_X_y=True, as_frame=True)

        atom = ATOMRegressor(X, y)
        atom.run(["OLS", "LGB"])
        atom.plot_residuals()
        ```

        """
        ds = self._get_set(dataset, max_one=True)
        target = self.branch._get_target(target, only_columns=True)

        fig = self._get_figure()
        # Two horizontally stacked axes: scatter (left, 69% of width)
        # and the residuals histogram (right)
        xaxis, yaxis = BasePlot._fig.get_axes(x=(0, 0.69))
        xaxis2, yaxis2 = BasePlot._fig.get_axes(x=(0.71, 1.0))
        for m in models:
            y_true, y_pred = m._get_pred(ds, target)

            fig.add_trace(
                go.Scatter(
                    x=y_true,
                    # Residual = true - predicted; reused by the histogram
                    y=(res := np.subtract(y_true, y_pred)),
                    mode="markers",
                    line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
                    name=m.name,
                    legendgroup=m.name,
                    showlegend=BasePlot._fig.showlegend(m.name, legend),
                    xaxis=xaxis,
                    yaxis=yaxis,
                )
            )

            fig.add_trace(
                go.Histogram(
                    y=res,
                    # Shared bingroup aligns bins across models
                    bingroup="residuals",
                    marker=dict(
                        color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
                        line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
                    ),
                    name=m.name,
                    legendgroup=m.name,
                    showlegend=False,  # legend entry comes from the scatter
                    xaxis=xaxis2,
                    yaxis=yaxis,  # shares the y-range with the scatter
                )
            )

        # Zero-residual reference line
        self._draw_straight_line(y=0, xaxis=xaxis, yaxis=yaxis)

        # NOTE(review): the yaxis key is built from the *xaxis* suffix;
        # this only targets the intended axis if get_axes returns pairs
        # with matching numeric suffixes — confirm.
        fig.update_layout({f"yaxis{xaxis[1:]}_showgrid": True, "barmode": "overlay"})

        # Style the secondary (histogram) axes without returning yet
        self._plot(
            ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
            xlabel="Distribution",
            title=title,
        )

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
            groupclick="togglegroup",
            ylabel="Residuals",
            xlabel="True value",
            title=title,
            legend=legend,
            figsize=figsize,
            plotname="plot_residuals",
            filename=filename,
            display=display,
        )
-
    @composed(crash, plot_from_model)
    def plot_results(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        metric: INT | str | SEQUENCE | None = None,
        *,
        title: str | dict | None = None,
        legend: str | dict | None = "lower right",
        figsize: tuple[INT, INT] | None = None,
        filename: str | None = None,
        display: bool | None = True,
    ) -> go.Figure | None:
        """Plot the model results.

        If all models applied bootstrap, the plot is a boxplot. If
        not, the plot is a barplot. Models are ordered based on
        their score from the top down. The score is either the
        `score_bootstrap` or `score_test` attribute of the model,
        selected in that order.

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models to plot. If None, all models are selected.

        metric: int, str, sequence or None, default=None
            Metric to plot (only for multi-metric runs). Other available
            options are "time_bo", "time_fit", "time_bootstrap" and
            "time". If str, add `+` between options to select more than
            one. If None, the metric used to run the pipeline is selected.

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="lower right"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple or None, default=None
            Figure's size in pixels, format as (x, y). If None, it
            adapts the size to the number of models.

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [go.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:PredictionPlot.plot_confusion_matrix
        atom.plots:PredictionPlot.plot_probabilities
        atom.plots:PredictionPlot.plot_threshold

        Examples
        --------
        ```pycon
        from atom import ATOMClassifier
        from sklearn.datasets import make_classification

        X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)

        atom = ATOMClassifier(X, y, random_state=1)
        atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"])
        atom.plot_results()

        atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"], n_bootstrap=5)
        atom.plot_results()
        atom.plot_results(metric="time_fit+time")
        ```

        """

        def get_std(model: MODEL, metric: int) -> SCALAR:
            """Get the standard deviation of the bootstrap scores.

            Parameters
            ----------
            model: Model
                Model to get the std from.

            metric: int
                Index of the metric to get it from.

            Returns
            -------
            int or float
                Standard deviation score or 0 if not bootstrapped.

            """
            if model.bootstrap is None:
                return 0
            else:
                return model.bootstrap.iloc[:, metric].std()

        metric = self._get_metric(metric, max_one=False)

        fig = self._get_figure()
        xaxis, yaxis = BasePlot._fig.get_axes()

        for met in metric:
            # A str metric is a time attribute (e.g. "time_fit"); an int
            # is an index into the pipeline's scoring metrics
            if isinstance(met, str):
                color = BasePlot._fig.get_elem(met)
                fig.add_trace(
                    go.Bar(
                        x=[getattr(m, met) for m in models],
                        y=[m.name for m in models],
                        orientation="h",
                        marker=dict(
                            # Translucent fill from the metric's color
                            color=f"rgba({color[4:-1]}, 0.2)",
                            line=dict(width=2, color=color),
                        ),
                        hovertemplate=f"%{{x}}{met}",
                        name=met,
                        legendgroup=met,
                        showlegend=BasePlot._fig.showlegend(met, legend),
                        xaxis=xaxis,
                        yaxis=yaxis,
                    )
                )
            else:
                name = self._metric[met].name
                color = BasePlot._fig.get_elem()

                if all(m.score_bootstrap for m in models):
                    # Every model was bootstrapped -> boxplot of all scores
                    x = np.array([m.bootstrap.iloc[:, met] for m in models]).ravel()
                    y = np.array([[m.name] * len(m.bootstrap) for m in models]).ravel()
                    fig.add_trace(
                        go.Box(
                            x=x,
                            y=list(y),
                            marker_color=color,
                            boxpoints="outliers",
                            orientation="h",
                            name=name,
                            legendgroup=name,
                            showlegend=BasePlot._fig.showlegend(name, legend),
                            xaxis=xaxis,
                            yaxis=yaxis,
                        )
                    )
                else:
                    # Barplot of best scores with std error bars (0 when
                    # the model wasn't bootstrapped)
                    fig.add_trace(
                        go.Bar(
                            x=[get_best_score(m, met) for m in models],
                            y=[m.name for m in models],
                            error_x=dict(
                                type="data",
                                array=[get_std(m, met) for m in models],
                            ),
                            orientation="h",
                            marker=dict(
                                color=f"rgba({color[4:-1]}, 0.2)",
                                line=dict(width=2, color=color),
                            ),
                            hovertemplate="%{x}",
                            name=name,
                            legendgroup=name,
                            showlegend=BasePlot._fig.showlegend(name, legend),
                            xaxis=xaxis,
                            yaxis=yaxis,
                        )
                    )

        # Order models by total score, top down
        fig.update_layout(
            {
                f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
                "bargroupgap": 0.05,
                "boxmode": "group",
            }
        )

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
            # All-str metrics means only time attributes were requested
            xlabel="time (s)" if all(isinstance(m, str) for m in metric) else "Score",
            title=title,
            legend=legend,
            figsize=figsize or (900, 400 + len(models) * 50),
            plotname="plot_results",
            filename=filename,
            display=display,
        )
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_roc(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- dataset: str | SEQUENCE = "test",
- target: INT | str = 0,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower right",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot the Receiver Operating Characteristics curve.
-
- Read more about [ROC][] in sklearn's documentation. Only
- available for classification tasks.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- dataset: str or sequence, default="test"
- Data set on which to calculate the metric. Use a sequence
- or add `+` between options to select more than one. Choose
- from: "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower right"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_gains
- atom.plots:PredictionPlot.plot_lift
- atom.plots:PredictionPlot.plot_prc
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_roc()
- ```
-
- """
- dataset = self._get_set(dataset, max_one=False)
- target = self.branch._get_target(target, only_columns=True)
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
- for m in models:
- for ds in dataset:
- # Get False (True) Positive Rate as arrays
- fpr, tpr, _ = roc_curve(*m._get_pred(ds, target, attr="thresh"))
-
- fig.add_trace(
- self._draw_line(
- x=fpr,
- y=tpr,
- mode="lines",
- parent=m.name,
- child=ds,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlim=(-0.03, 1.03),
- ylim=(-0.03, 1.03),
- xlabel="FPR",
- ylabel="TPR",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_roc",
- filename=filename,
- display=display,
- )
-
    @composed(crash, plot_from_model(ensembles=False))
    def plot_successive_halving(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        metric: INT | str | SEQUENCE | None = None,
        *,
        title: str | dict | None = None,
        legend: str | dict | None = "lower right",
        figsize: tuple[INT, INT] = (900, 600),
        filename: str | None = None,
        display: bool | None = True,
    ) -> go.Figure | None:
        """Plot scores per iteration of the successive halving.

        Only use with models fitted using [successive halving][].
        [Ensembles][] are ignored.

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models to plot. If None, all models are selected.

        metric: int, str, sequence or None, default=None
            Metric to plot (only for multi-metric runs). Use a sequence
            or add `+` between options to select more than one. If None,
            the metric used to run the pipeline is selected.

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="lower right"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple, default=(900, 600)
            Figure's size in pixels, format as (x, y).

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [go.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:PredictionPlot.plot_learning_curve
        atom.plots:PredictionPlot.plot_results

        Examples
        --------
        ```pycon
        from atom import ATOMClassifier
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True, as_frame=True)

        atom = ATOMClassifier(X, y, random_state=1)
        atom.successive_halving(["Tree", "Bag", "RF", "LGB"], n_bootstrap=5)
        atom.plot_successive_halving()
        ```

        """
        metric = self._get_metric(metric, max_one=False)

        fig = self._get_figure()
        xaxis, yaxis = BasePlot._fig.get_axes()

        for met in metric:
            # Per model group: x = halving iteration, y = best score,
            # std = bootstrap spread (only when available)
            x, y, std = defaultdict(list), defaultdict(list), defaultdict(list)
            for m in models:
                # Presumably the number of models in this iteration:
                # train-set size divided by this model's training-subset
                # size — TODO confirm against the successive halving runner
                x[m._group].append(len(m.branch._idx[1]) // m._train_idx)
                y[m._group].append(get_best_score(m, met))
                if m.bootstrap is not None:
                    std[m._group].append(m.bootstrap.iloc[:, met].std())

            for group in x:
                fig.add_trace(
                    self._draw_line(
                        x=x[group],
                        y=y[group],
                        mode="lines+markers",
                        marker_symbol="circle",
                        error_y=dict(type="data", array=std[group], visible=True),
                        parent=group,
                        child=self._metric[met].name,
                        legend=legend,
                        xaxis=xaxis,
                        yaxis=yaxis,
                    )
                )

                # Add error bands
                # NOTE(review): `m` is the last model of the loop above, so
                # this condition reflects only that model's bootstrap state
                # for every group — confirm this is intended
                if m.bootstrap is not None:
                    fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)"
                    fig.add_traces(
                        [
                            # Upper bound (mean + std)
                            go.Scatter(
                                x=x[group],
                                y=np.add(y[group], std[group]),
                                mode="lines",
                                line=dict(width=1, color=BasePlot._fig.get_elem(group)),
                                hovertemplate="%{y}upper bound",
                                legendgroup=group,
                                showlegend=False,
                                xaxis=xaxis,
                                yaxis=yaxis,
                            ),
                            # Lower bound (mean - std), filled to the upper
                            go.Scatter(
                                x=x[group],
                                y=np.subtract(y[group], std[group]),
                                mode="lines",
                                line=dict(width=1, color=BasePlot._fig.get_elem(group)),
                                fill="tonexty",
                                fillcolor=fillcolor,
                                hovertemplate="%{y}lower bound",
                                legendgroup=group,
                                showlegend=False,
                                xaxis=xaxis,
                                yaxis=yaxis,
                            ),
                        ]
                    )

        # Iterations shrink the model count, so reverse the x-axis
        fig.update_layout({f"xaxis{yaxis[1:]}": dict(dtick=1, autorange="reversed")})

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
            groupclick="togglegroup",
            title=title,
            legend=legend,
            xlabel="n_models",
            ylabel="Score",
            figsize=figsize,
            plotname="plot_successive_halving",
            filename=filename,
            display=display,
        )
-
- @available_if(has_task(["binary", "multilabel"]))
- @composed(crash, plot_from_model)
- def plot_threshold(
- self,
- models: INT | str | MODEL | slice | SEQUENCE | None = None,
- metric: METRIC_SELECTOR = None,
- dataset: str = "test",
- target: INT | str = 0,
- steps: INT = 100,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = "lower left",
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> go.Figure | None:
- """Plot metric performances against threshold values.
-
- This plot is available only for models with a `predict_proba`
- method in a binary or [multilabel][] classification task.
-
- Parameters
- ----------
- models: int, str, Model, slice, sequence or None, default=None
- Models to plot. If None, all models are selected.
-
- metric: str, func, scorer, sequence or None, default=None
- Metric to plot. Choose from any of sklearn's scorers, a
- function with signature `metric(y_true, y_pred)`, a scorer
- object or a sequence of these. Use a sequence or add `+`
- between options to select more than one. If None, the
- metric used to run the pipeline is selected.
-
- dataset: str, default="test"
- Data set on which to calculate the metric. Choose from:
- "train", "test" or "holdout".
-
- target: int or str, default=0
- Target column to look at. Only for [multilabel][] tasks.
-
- steps: int, default=100
- Number of thresholds measured.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default="lower left"
- Legend for the plot. See the [user guide][parameters] for
- an extended description of the choices.
-
- - If None: No legend is shown.
- - If str: Location where to show the legend.
- - If dict: Legend configuration.
-
- figsize: tuple, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as html. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [go.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_calibration
- atom.plots:PredictionPlot.plot_confusion_matrix
- atom.plots:PredictionPlot.plot_probabilities
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import make_classification
-
- X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run(["LR", "RF"])
- atom.plot_threshold()
- ```
-
- """
- check_predict_proba(models, "plot_threshold")
- ds = self._get_set(dataset, max_one=True)
- target = self.branch._get_target(target, only_columns=True)
-
- # Get all metric functions from the input
- if metric is None:
- metrics = [m._score_func for m in self._metric]
- else:
- metrics = []
- for m in lst(metric):
- if isinstance(m, str):
- metrics.extend(m.split("+"))
- else:
- metrics.append(m)
- metrics = [get_custom_scorer(m)._score_func for m in metrics]
-
- fig = self._get_figure()
- xaxis, yaxis = BasePlot._fig.get_axes()
-
- steps = np.linspace(0, 1, steps)
- for m in models:
- y_true, y_pred = m._get_pred(ds, target, attr="predict_proba")
- for met in metrics:
- fig.add_trace(
- self._draw_line(
- x=steps,
- y=[met(y_true, y_pred >= step) for step in steps],
- parent=m.name,
- child=met.__name__,
- legend=legend,
- xaxis=xaxis,
- yaxis=yaxis,
- )
- )
-
- BasePlot._fig.used_models.extend(models)
- return self._plot(
- ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
- xlabel="Threshold",
- ylabel="Score",
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_threshold",
- filename=filename,
- display=display,
- )
-
-
-@typechecked
-class ShapPlot(BasePlot):
- """Shap plots.
-
- ATOM wrapper for plots made by the shap package, using Shapley
- values for model interpretation. These plots are accessible from
- the runners or from the models. Only one model can be plotted at
- the same time since the plots are not made by ATOM.
-
- """
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_bar(
- self,
- models: INT | str | MODEL | None = None,
- index: SLICE | None = None,
- show: INT | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot SHAP's bar plot.
-
- Create a bar plot of a set of SHAP values. If a single sample
- is passed, then the SHAP values are plotted. If many samples
- are passed, then the mean absolute value for each feature
- column is plotted. Read more about SHAP plots in the
- [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_bar()`.
-
- index: int, str, slice, sequence or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_parshap
- atom.plots:ShapPlot.plot_shap_beeswarm
- atom.plots:ShapPlot.plot_shap_scatter
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_bar(show=10)
- ```
-
- """
- rows = models.X.loc[models.branch._get_rows(index)]
- show = self._get_show(show, models)
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_bar")
-
- shap.plots.bar(explanation, max_display=show, show=False)
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=plt.gca(),
- xlabel=plt.gca().get_xlabel(),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_shap_bar",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_beeswarm(
- self,
- models: INT | str | MODEL | None = None,
- index: slice | SEQUENCE | None = None,
- show: INT | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot SHAP's beeswarm plot.
-
- The plot is colored by feature values. Read more about SHAP
- plots in the [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_beeswarm()`.
-
- index: tuple, slice or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set. The beeswarm plot does not support plotting
- a single sample.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:PredictionPlot.plot_parshap
- atom.plots:ShapPlot.plot_shap_bar
- atom.plots:ShapPlot.plot_shap_scatter
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_beeswarm(show=10)
- ```
-
- """
- rows = models.X.loc[models.branch._get_rows(index)]
- show = self._get_show(show, models)
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_beeswarm")
-
- shap.plots.beeswarm(explanation, max_display=show, show=False)
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=plt.gca(),
- xlabel=plt.gca().get_xlabel(),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_decision(
- self,
- models: INT | str | MODEL | None = None,
- index: SLICE | None = None,
- show: INT | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot SHAP's decision plot.
-
- Visualize model decisions using cumulative SHAP values. Each
- plotted line explains a single model prediction. If a single
- prediction is plotted, feature values are printed in the
- plot (if supplied). If multiple predictions are plotted
- together, feature values will not be printed. Plotting too
- many predictions together will make the plot unintelligible.
- Read more about SHAP plots in the [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_decision()`.
-
- index: int, str, slice, sequence or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:ShapPlot.plot_shap_bar
- atom.plots:ShapPlot.plot_shap_beeswarm
- atom.plots:ShapPlot.plot_shap_force
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_decision(show=10)
- atom.plot_shap_decision(index=-1, show=10)
- ```
-
- """
- rows = models.X.loc[models.branch._get_rows(index)]
- show = self._get_show(show, models)
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_decision")
-
- shap.decision_plot(
- base_value=explanation.base_values,
- shap_values=explanation.values,
- features=rows,
- feature_display_range=slice(-1, -show - 1, -1),
- auto_size_plot=False,
- show=False,
- )
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=plt.gca(),
- xlabel=plt.gca().get_xlabel(),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_shap_decision",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_force(
- self,
- models: INT | str | MODEL | None = None,
- index: SLICE | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (900, 300),
- filename: str | None = None,
- display: bool | None = True,
- **kwargs,
- ) -> plt.Figure | None:
- """Plot SHAP's force plot.
-
- Visualize the given SHAP values with an additive force layout.
- Note that by default this plot will render using javascript.
- For a regular figure use `matplotlib=True` (this option is
- only available when only a single sample is plotted). Read more
- about SHAP plots in the [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_force()`.
-
- index: int, str, slice, sequence or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=(900, 300)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- **kwargs
- Additional keyword arguments for [shap.plots.force][force].
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:ShapPlot.plot_shap_beeswarm
- atom.plots:ShapPlot.plot_shap_scatter
- atom.plots:ShapPlot.plot_shap_decision
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_force(index=-2, matplotlib=True, figsize=(1800, 300))
- ```
-
- """
- rows = models.X.loc[models.branch._get_rows(index)]
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- self._get_figure(create_figure=False, backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_force")
-
- plot = shap.force_plot(
- base_value=explanation.base_values,
- shap_values=explanation.values,
- features=rows,
- show=False,
- **kwargs,
- )
-
- if kwargs.get("matplotlib"):
- BasePlot._fig.used_models.append(models)
- return self._plot(
- fig=plt.gcf(),
- ax=plt.gca(),
- title=title,
- legend=legend,
- figsize=figsize,
- plotname="plot_shap_force",
- filename=filename,
- display=display,
- )
- else:
- if filename: # Save to a html file
- if not filename.endswith(".html"):
- filename += ".html"
- shap.save_html(filename, plot)
- if display and find_spec("IPython"):
- from IPython.display import display
-
- shap.initjs()
- display(plot)
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_heatmap(
- self,
- models: INT | str | MODEL | None = None,
- index: slice | SEQUENCE | None = None,
- show: INT | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot SHAP's heatmap plot.
-
- This plot is designed to show the population substructure of a
- dataset using supervised clustering and a heatmap. Supervised
- clustering involves clustering data points not by their original
- feature values but by their explanations. Read more about SHAP
- plots in the [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_heatmap()`.
-
- index: slice, sequence or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set. The plot_shap_heatmap method does not
- support plotting a single sample.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:ShapPlot.plot_shap_decision
- atom.plots:ShapPlot.plot_shap_force
- atom.plots:ShapPlot.plot_shap_waterfall
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_heatmap(show=10)
- ```
-
- """
- rows = models.X.loc[models.branch._get_rows(index)]
- show = self._get_show(show, models)
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_heatmap")
-
- shap.plots.heatmap(explanation, max_display=show, show=False)
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=plt.gca(),
- xlabel=plt.gca().get_xlabel(),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_shap_heatmap",
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_scatter(
- self,
- models: INT | str | MODEL | None = None,
- index: slice | SEQUENCE | None = None,
- columns: INT | str = 0,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] = (900, 600),
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot SHAP's scatter plot.
-
- Plots the value of the feature on the x-axis and the SHAP value
- of the same feature on the y-axis. This shows how the model
- depends on the given feature, and is like a richer extension of
- the classical partial dependence plots. Vertical dispersion of
- the data points represents interaction effects. Read more about
- SHAP plots in the [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_scatter()`.
-
- index: slice, sequence or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set. The plot_shap_scatter method does not
- support plotting a single sample.
-
- columns: int or str, default=0
- Column to plot.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=(900, 600)
- Figure's size in pixels, format as (x, y).
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:ShapPlot.plot_shap_beeswarm
- atom.plots:ShapPlot.plot_shap_decision
- atom.plots:ShapPlot.plot_shap_force
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_scatter(columns="symmetry error")
- ```
-
- """
- rows = models.X.loc[models.branch._get_rows(index)]
- column = models.branch._get_columns(columns, include_target=False)[0]
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- # Get explanation for a specific column
- explanation = explanation[:, models.columns.get_loc(column)]
-
- self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_scatter")
-
- shap.plots.scatter(explanation, color=explanation, ax=plt.gca(), show=False)
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=plt.gca(),
- xlabel=plt.gca().get_xlabel(),
- ylabel=plt.gca().get_ylabel(),
- title=title,
- legend=legend,
- plotname="plot_shap_scatter",
- figsize=figsize,
- filename=filename,
- display=display,
- )
-
- @composed(crash, plot_from_model(max_one=True))
- def plot_shap_waterfall(
- self,
- models: INT | str | MODEL | None = None,
- index: INT | str | None = None,
- show: INT | None = None,
- target: INT | str | tuple = 1,
- *,
- title: str | dict | None = None,
- legend: str | dict | None = None,
- figsize: tuple[INT, INT] | None = None,
- filename: str | None = None,
- display: bool | None = True,
- ) -> plt.Figure | None:
- """Plot SHAP's waterfall plot.
-
- The SHAP value of a feature represents the impact of the
- evidence provided by that feature on the model’s output. The
- waterfall plot is designed to visually display how the SHAP
- values (evidence) of each feature move the model output from
- our prior expectation under the background data distribution,
- to the final model prediction given the evidence of all the
- features. Features are sorted by the magnitude of their SHAP
- values with the smallest magnitude features grouped together
- at the bottom of the plot when the number of features in the
- models exceeds the `show` parameter. Read more about SHAP plots
- in the [user guide][shap].
-
- Parameters
- ----------
- models: int, str, Model or None, default=None
- Model to plot. If None, all models are selected. Note that
- leaving the default option could raise an exception if there
- are multiple models. To avoid this, call the plot directly
- from a model, e.g. `atom.lr.plot_shap_waterfall()`.
-
- index: int, str or None, default=None
- Rows in the dataset to plot. If None, it selects all rows
- in the test set. The plot_shap_waterfall method does not
- support plotting multiple samples.
-
- show: int or None, default=None
- Number of features (ordered by importance) to show. If
- None, it shows all features.
-
- target: int, str or tuple, default=1
- Class in the target column to target. For multioutput tasks,
- the value should be a tuple of the form (column, class).
- Note that for binary and multilabel tasks, the selected
- class is always the positive one.
-
- title: str, dict or None, default=None
- Title for the plot.
-
- - If None, no title is shown.
- - If str, text for the title.
- - If dict, [title configuration][parameters].
-
- legend: str, dict or None, default=None
- Does nothing. Implemented for continuity of the API.
-
- figsize: tuple or None, default=None
- Figure's size in pixels, format as (x, y). If None, it
- adapts the size to the number of features shown.
-
- filename: str or None, default=None
- Save the plot using this name. Use "auto" for automatic
- naming. The type of the file depends on the provided name
- (.html, .png, .pdf, etc...). If `filename` has no file type,
- the plot is saved as png. If None, the plot is not saved.
-
- display: bool or None, default=True
- Whether to render the plot. If None, it returns the figure.
-
- Returns
- -------
- [plt.Figure][] or None
- Plot object. Only returned if `display=None`.
-
- See Also
- --------
- atom.plots:ShapPlot.plot_shap_bar
- atom.plots:ShapPlot.plot_shap_beeswarm
- atom.plots:ShapPlot.plot_shap_heatmap
-
- Examples
- --------
- ```pycon
- from atom import ATOMClassifier
- from sklearn.datasets import load_breast_cancer
-
- X, y = load_breast_cancer(return_X_y=True, as_frame=True)
-
- atom = ATOMClassifier(X, y, random_state=1)
- atom.run("LR")
- atom.plot_shap_waterfall(show=10)
- ```
-
- """
- rows = models.X.loc[[models.branch._get_rows(index)[0]]]
- show = self._get_show(show, models)
- target = self.branch._get_target(target)
- explanation = models._shap.get_explanation(rows, target)
-
- # Waterfall accepts only one row
- explanation.values = explanation.values[0]
- explanation.data = explanation.data[0]
-
- self._get_figure(backend="matplotlib")
- check_canvas(BasePlot._fig.is_canvas, "plot_shap_waterfall")
-
- shap.plots.waterfall(explanation, max_display=show, show=False)
-
- BasePlot._fig.used_models.append(models)
- return self._plot(
- ax=plt.gca(),
- title=title,
- legend=legend,
- figsize=figsize or (900, 400 + show * 50),
- plotname="plot_shap_waterfall",
- filename=filename,
- display=display,
- )
diff --git a/atom/plots/__init__.py b/atom/plots/__init__.py
new file mode 100644
index 000000000..765a2ac2e
--- /dev/null
+++ b/atom/plots/__init__.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module for plots.
+
+"""
+
+from atom.plots.dataplot import DataPlot
+from atom.plots.featureselectionplot import FeatureSelectionPlot
+from atom.plots.hyperparametertuningplot import HyperparameterTuningPlot
+from atom.plots.predictionplot import PredictionPlot
+from atom.plots.shapplot import ShapPlot
+
+
+class ATOMPlot(
+ FeatureSelectionPlot,
+ DataPlot,
+ HyperparameterTuningPlot,
+ PredictionPlot,
+ ShapPlot,
+):
+ """Plot classes inherited by main ATOM classes."""
+ pass
+
+
+class RunnerPlot(HyperparameterTuningPlot, PredictionPlot, ShapPlot):
+ """Plot classes inherited by the runners and callable from models."""
+ pass
diff --git a/atom/plots/base.py b/atom/plots/base.py
new file mode 100644
index 000000000..7028c6ce9
--- /dev/null
+++ b/atom/plots/base.py
@@ -0,0 +1,1117 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing the base classes for plotting.
+
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from dataclasses import dataclass
+from itertools import cycle
+from typing import Literal
+
+import matplotlib.pyplot as plt
+import plotly.express as px
+import plotly.graph_objects as go
+from mlflow.tracking import MlflowClient
+from typeguard import typechecked
+
+from atom.utils.constants import PALETTE
+from atom.utils.types import (
+ BOOL, DATAFRAME, FLOAT, INDEX, INT, INT_TYPES, LEGEND, MODEL, SCALAR,
+ SEQUENCE,
+)
+from atom.utils.utils import (
+ composed, crash, divide, get_custom_scorer, lst, rnd, to_rgb,
+)
+
+
@dataclass
class Aesthetics:
    """Keeps track of plot aesthetics.

    Mutable container for the style attributes shared by all plots.
    The values are read and mutated through BasePlot's properties
    (e.g. `palette`, `title_fontsize`), which perform the validation.

    """

    palette: SEQUENCE  # Sequence of colors
    title_fontsize: INT  # Fontsize for titles
    label_fontsize: INT  # Fontsize for labels, legend and hoverinfo
    tick_fontsize: INT  # Fontsize for ticks
    line_width: INT  # Width of the line plots
    marker_size: INT  # Size of the markers
+
+
+@typechecked
+class BaseFigure:
+ """Base plotly figure.
+
+ The instance stores the position of the current axes in grid,
+ as well as the models used for the plot (to track in mlflow).
+
+ Parameters
+ ----------
+ rows: int, default=1
+ Number of subplot rows in the canvas.
+
+ cols: int, default=1
+ Number of subplot columns in the canvas.
+
+ horizontal_spacing: float, default=0.05
+ Space between subplot rows in normalized plot coordinates.
+ The spacing is relative to the figure's size.
+
+ vertical_spacing: float, default=0.07
+ Space between subplot cols in normalized plot coordinates.
+ The spacing is relative to the figure's size.
+
+ palette: str or sequence, default="Prism"
+ Name or color sequence for the palette.
+
+ is_canvas: bool, default=False
+ Whether the figure shows multiple plots.
+
+ backend: str, default="plotly"
+ Figure's backend. Choose between plotly or matplotlib.
+
+ create_figure: bool, default=True
+ Whether to create a new figure.
+
+ """
+
+ _marker = ["circle", "x", "diamond", "pentagon", "star", "hexagon"]
+ _dash = [None, "dashdot", "dash", "dot", "longdash", "longdashdot"]
+ _shape = ["", "/", "x", "\\", "-", "|", "+", "."]
+
+ def __init__(
+ self,
+ rows: INT = 1,
+ cols: INT = 1,
+ horizontal_spacing: FLOAT = 0.05,
+ vertical_spacing: FLOAT = 0.07,
+ palette: str | SEQUENCE = "Prism",
+ is_canvas: BOOL = False,
+ backend: Literal["plotly", "matplotlib"] = "plotly",
+ create_figure: BOOL = True,
+ ):
+ self.rows = rows
+ self.cols = cols
+ self.horizontal_spacing = horizontal_spacing
+ self.vertical_spacing = vertical_spacing
+ if isinstance(palette, str):
+ self._palette = getattr(px.colors.qualitative, palette)
+ self.palette = cycle(self._palette)
+ else:
+ # Convert color names or hex to rgb
+ self._palette = list(map(to_rgb, palette))
+ self.palette = cycle(self._palette)
+ self.is_canvas = is_canvas
+ self.backend = backend
+ self.create_figure = create_figure
+
+ self.idx = 0 # N-th plot in the canvas
+ self.axes = 0 # N-th axis in the canvas
+ if self.create_figure:
+ if self.backend == "plotly":
+ self.figure = go.Figure()
+ else:
+ self.figure, _ = plt.subplots()
+
+ self.groups = []
+ self.style = dict(palette={}, marker={}, dash={}, shape={})
+ self.marker = cycle(self._marker)
+ self.dash = cycle(self._dash)
+ self.shape = cycle(self._shape)
+
+ self.pos = {} # Subplot position to use for title
+ self.custom_layout = {} # Layout params specified by user
+ self.used_models = [] # Models plotted in this figure
+
+ # Perform parameter checks
+ if not 0 < horizontal_spacing < 1:
+ raise ValueError(
+ "Invalid value for the horizontal_spacing parameter. The "
+ f"value must lie between 0 and 1, got {horizontal_spacing}."
+ )
+
+ if not 0 < vertical_spacing < 1:
+ raise ValueError(
+ "Invalid value for the vertical_spacing parameter. The "
+ f"value must lie between 0 and 1, got {vertical_spacing}."
+ )
+
+ @property
+ def grid(self) -> tuple[INT, INT]:
+ """Position of the current axes on the grid.
+
+ Returns
+ -------
+ int
+ X-position.
+
+ int
+ Y-position.
+
+ """
+ return (self.idx - 1) // self.cols + 1, self.idx % self.cols or self.cols
+
+ @property
+ def next_subplot(self) -> go.Figure | plt.Figure | None:
+ """Increase the subplot index.
+
+ Returns
+ -------
+ go.Figure, plt.Figure or None
+ Current figure. Returns None if `create_figure=False`.
+
+ """
+ # Check if there are too many plots in the canvas
+ if self.idx >= self.rows * self.cols:
+ raise ValueError(
+ "Invalid number of plots in the canvas! Increase "
+ "the number of rows and cols to add more plots."
+ )
+ else:
+ self.idx += 1
+
+ if self.create_figure:
+ return self.figure
+
    def get_elem(
        self,
        name: SCALAR | str | None = None,
        element: Literal["palette", "marker", "dash", "shape"] = "palette",
    ) -> str | None:
        """Get the plot element for a specific name.

        This method is used to assign the same element (color, marker,
        etc...) to the same columns and models in a plot.

        Parameters
        ----------
        name: int, float or str or None, default=None
            Name for which to get the plot element. The name is stored in
            the element attributes to assign the same element to all calls
            with the same name. If None, return the first element.

        element: str, default="palette"
            Plot element to get. Choose from: palette, marker, dash, shape.

        Returns
        -------
        str or None
            Element code.

        """
        if name is None:
            return getattr(self, f"_{element}")[0]  # Get first element (default)
        elif name in self.style[element]:
            # Known name: return the stored element. Note that calling
            # setdefault directly here would still evaluate next(...) and
            # needlessly advance the cycle, hence the explicit check
            return self.style[element][name]
        else:
            # New name: draw the next element from the cycle and store it
            return self.style[element].setdefault(name, next(getattr(self, element)))
+
+ def showlegend(self, name: str, legend: LEGEND | dict | None) -> BOOL:
+ """Get whether the trace should be showed in the legend.
+
+ If there's already a trace with the same name, it's not
+ necessary to show it in the plot's legend.
+
+ Parameters
+ ----------
+ name: str
+ Name of the trace.
+
+ legend: str, dict or None
+ Legend parameter.
+
+ Returns
+ -------
+ bool
+ Whether the trace should be placed in the legend.
+
+ """
+ if name in self.groups:
+ return False
+ else:
+ self.groups.append(name)
+ return legend is not None
+
    def get_axes(
        self,
        x: tuple[SCALAR, SCALAR] = (0, 1),
        y: tuple[SCALAR, SCALAR] = (0, 1),
        coloraxis: dict | None = None,
    ) -> tuple[str, str]:
        """Create and update the plot's axes.

        Parameters
        ----------
        x: tuple
            Relative x-size of the plot.

        y: tuple
            Relative y-size of the plot.

        coloraxis: dict or None
            Properties of the coloraxis to create. None to ignore.

        Returns
        -------
        str
            Name of the x-axis.

        str
            Name of the y-axis.

        """
        self.axes += 1  # Axes numbering starts at 1

        # Calculate the distance between subplots
        x_offset = divide(self.horizontal_spacing, (self.cols - 1))
        y_offset = divide(self.vertical_spacing, (self.rows - 1))

        # Calculate the size of the subplot
        x_size = (1 - ((x_offset * 2) * (self.cols - 1))) / self.cols
        y_size = (1 - ((y_offset * 2) * (self.rows - 1))) / self.rows

        # Calculate the size of the axes
        ax_size = (x[1] - x[0]) * x_size
        ay_size = (y[1] - y[0]) * y_size

        # Determine the position for the axes
        x_pos = (self.grid[1] - 1) * (x_size + 2 * x_offset) + x[0] * x_size
        y_pos = (self.rows - self.grid[0]) * (y_size + 2 * y_offset) + y[0] * y_size

        # Store positions for subplot title
        self.pos[str(self.axes)] = (x_pos + ax_size / 2, rnd(y_pos + ay_size))

        # Update the figure with the new axes (domains are normalized
        # [0, 1] plot coordinates)
        self.figure.update_layout(
            {
                f"xaxis{self.axes}": dict(
                    domain=(x_pos, rnd(x_pos + ax_size)), anchor=f"y{self.axes}"
                ),
                f"yaxis{self.axes}": dict(
                    domain=(y_pos, rnd(y_pos + ay_size)), anchor=f"x{self.axes}"
                ),
            }
        )

        # Place a colorbar right of the axes
        if coloraxis:
            if title := coloraxis.pop("title", None):
                coloraxis["colorbar_title"] = dict(
                    text=title, side="right", font_size=coloraxis.pop("font_size")
                )

            coloraxis["colorbar_x"] = rnd(x_pos + ax_size) + ax_size / 40
            coloraxis["colorbar_xanchor"] = "left"
            coloraxis["colorbar_y"] = y_pos + ay_size / 2
            coloraxis["colorbar_yanchor"] = "middle"
            coloraxis["colorbar_len"] = ay_size * 0.9
            coloraxis["colorbar_thickness"] = ax_size * 30  # Default width in pixels
            self.figure.update_layout(
                {f"coloraxis{coloraxis.pop('axes', self.axes)}": coloraxis}
            )

        # Plotly's first axes carry no numeric suffix ("x", not "x1")
        xaxis = f"x{self.axes if self.axes > 1 else ''}"
        yaxis = f"y{self.axes if self.axes > 1 else ''}"
        return xaxis, yaxis
+
+
+@typechecked
+class BasePlot:
+ """Base class for all plotting methods.
+
+ This base class defines the properties that can be changed
+ to customize the plot's aesthetics.
+
+ """
+
+ _fig = None
+ _custom_layout = {}
+ _custom_traces = {}
+ _aesthetics = Aesthetics(
+ palette=list(PALETTE),
+ title_fontsize=24,
+ label_fontsize=16,
+ tick_fontsize=12,
+ line_width=2,
+ marker_size=8,
+ )
+
+ # Properties =================================================== >>
+
+ @property
+ def aesthetics(self) -> Aesthetics:
+ """All plot aesthetic attributes."""
+ return self._aesthetics
+
+ @aesthetics.setter
+ def aesthetics(self, value: dict):
+ self.palette = value.get("palette", self.palette)
+ self.title_fontsize = value.get("title_fontsize", self.title_fontsize)
+ self.label_fontsize = value.get("label_fontsize", self.label_fontsize)
+ self.tick_fontsize = value.get("tick_fontsize", self.tick_fontsize)
+ self.line_width = value.get("line_width", self.line_width)
+ self.marker_size = value.get("marker_size", self.marker_size)
+
+ @property
+ def palette(self) -> str | SEQUENCE:
+ """Color palette.
+
+ Specify one of plotly's [built-in palettes][palette] or create
+ a custom one, e.g. `atom.palette = ["red", "green", "blue"]`.
+
+ """
+ return self._aesthetics.palette
+
+ @palette.setter
+ def palette(self, value: str | SEQUENCE):
+ if isinstance(value, str) and not hasattr(px.colors.qualitative, value):
+ raise ValueError(
+ f"Invalid value for the palette parameter, got {value}. Choose "
+ f"from one of plotly's built-in qualitative color sequences in "
+ f"the px.colors.qualitative module or define your own sequence."
+ )
+
+ self._aesthetics.palette = value
+
+ @property
+ def title_fontsize(self) -> INT:
+ """Fontsize for the plot's title."""
+ return self._aesthetics.title_fontsize
+
+ @title_fontsize.setter
+ def title_fontsize(self, value: INT):
+ if value <= 0:
+ raise ValueError(
+ "Invalid value for the title_fontsize parameter. "
+ f"Value should be >=0, got {value}."
+ )
+
+ self._aesthetics.title_fontsize = value
+
+ @property
+ def label_fontsize(self) -> INT:
+ """Fontsize for the labels, legend and hover information."""
+ return self._aesthetics.label_fontsize
+
+ @label_fontsize.setter
+ def label_fontsize(self, value: INT):
+ if value <= 0:
+ raise ValueError(
+ "Invalid value for the label_fontsize parameter. "
+ f"Value should be >=0, got {value}."
+ )
+
+ self._aesthetics.label_fontsize = value
+
+ @property
+ def tick_fontsize(self) -> INT:
+ """Fontsize for the ticks along the plot's axes."""
+ return self._aesthetics.tick_fontsize
+
+ @tick_fontsize.setter
+ def tick_fontsize(self, value: INT):
+ if value <= 0:
+ raise ValueError(
+ "Invalid value for the tick_fontsize parameter. "
+ f"Value should be >=0, got {value}."
+ )
+
+ self._aesthetics.tick_fontsize = value
+
+ @property
+ def line_width(self) -> INT:
+ """Width of the line plots."""
+ return self._aesthetics.line_width
+
+ @line_width.setter
+ def line_width(self, value: INT):
+ if value <= 0:
+ raise ValueError(
+ "Invalid value for the line_width parameter. "
+ f"Value should be >=0, got {value}."
+ )
+
+ self._aesthetics.line_width = value
+
+ @property
+ def marker_size(self) -> INT:
+ """Size of the markers."""
+ return self._aesthetics.marker_size
+
+ @marker_size.setter
+ def marker_size(self, value: INT):
+ if value <= 0:
+ raise ValueError(
+ "Invalid value for the marker_size parameter. "
+ f"Value should be >=0, got {value}."
+ )
+
+ self._aesthetics.marker_size = value
+
+ # Methods ====================================================== >>
+
+ @staticmethod
+ def _get_plot_index(df: DATAFRAME) -> INDEX:
+ """Return the dataset's index in a plottable format.
+
+ Plotly does not accept all index formats (e.g. pd.Period),
+ thus use this utility method to convert to timestamp those
+ indices that can, else return as is.
+
+ Parameters
+ ----------
+ df: dataframe
+ Data set to get the index from.
+
+ Returns
+ -------
+ index
+ Index in an acceptable format.
+
+ """
+ if hasattr(df.index, "to_timestamp"):
+ return df.index.to_timestamp()
+ else:
+ return df.index
+
+ @staticmethod
+ def _get_show(show: INT | None, model: MODEL | list[MODEL]) -> INT:
+ """Check and return the number of features to show.
+
+ Parameters
+ ----------
+ show: int or None
+ Number of features to show. If None, select all (max 200).
+
+ model: Model or list
+ Models from which to get the features.
+
+ Returns
+ -------
+ int
+ Number of features to show.
+
+ """
+ max_fxs = max(m.n_features for m in lst(model))
+ if show is None or show > max_fxs:
+ # Limit max features shown to avoid maximum figsize error
+ show = min(200, max_fxs)
+ elif show < 1:
+ raise ValueError(
+ f"Invalid value for the show parameter. Value should be >0, got {show}."
+ )
+
+ return show
+
    @staticmethod
    def _get_hyperparams(
        params: str | slice | SEQUENCE | None,
        model: MODEL,
    ) -> list[str]:
        """Check and return a model's hyperparameters.

        Parameters
        ----------
        params: str, slice, sequence or None
            Hyperparameters to get. Use a sequence or add `+` between
            options to select more than one. If None, all the model's
            hyperparameters are selected.

        model: Model
            Get the params from this model.

        Returns
        -------
        list of str
            Selected hyperparameters.

        """
        if params is None:
            hyperparameters = list(model._ht["distributions"])
        elif isinstance(params, slice):
            hyperparameters = list(model._ht["distributions"])[params]
        else:
            hyperparameters = []
            for param in lst(params):
                if isinstance(param, INT_TYPES):
                    # Integers index into the tuned distributions
                    hyperparameters.append(list(model._ht["distributions"])[param])
                elif isinstance(param, str):
                    # Strings may select multiple names joined with "+"
                    for p in param.split("+"):
                        if p not in model._ht["distributions"]:
                            raise ValueError(
                                "Invalid value for the params parameter. "
                                f"Hyperparameter {p} was not used during the "
                                f"optimization of model {model.name}."
                            )
                        else:
                            hyperparameters.append(p)

        if not hyperparameters:
            raise ValueError(f"Didn't find any hyperparameters for model {model.name}.")

        return hyperparameters
+
    def _get_metric(
        self,
        metric: INT | str | SEQUENCE | None,
        max_one: BOOL,
    ) -> INT | str | list[INT | str]:
        """Check and return the provided metric index.

        Parameters
        ----------
        metric: int, str, sequence or None
            Metric to retrieve. If None, all metrics are returned.

        max_one: bool
            Whether one or multiple metrics are allowed.

        Returns
        -------
        int, str or list
            Position index of the metric, or the name of a time metric
            (e.g. "time_fit"). If `max_one=False`, returns a list of
            metric positions/names.

        """
        if metric is None:
            return list(range(len(self._metric)))
        else:
            inc = []
            for met in lst(metric):
                if isinstance(met, INT_TYPES):
                    if 0 <= met < len(self._metric):
                        inc.append(met)
                    else:
                        raise ValueError(
                            f"Invalid value for the metric parameter. Value {met} is out "
                            f"of range for a pipeline with {len(self._metric)} metrics."
                        )
                elif isinstance(met, str):
                    met = met.lower()
                    for m in met.split("+"):
                        # Time metrics are kept by name (not part of _metric)
                        if m in ("time_ht", "time_fit", "time_bootstrap", "time"):
                            inc.append(m)
                        # NOTE(review): membership is checked against self.metric
                        # but the index is taken from self._metric — presumably
                        # these are aligned; verify
                        elif (name := get_custom_scorer(m).name) in self.metric:
                            inc.append(self._metric.index(name))
                        else:
                            raise ValueError(
                                "Invalid value for the metric parameter. The "
                                f"{name} metric wasn't used to fit the models."
                            )

            if len(inc) > 1 and max_one:
                raise ValueError(
                    "Invalid value for the metric parameter. "
                    f"Only one metric is allowed, got {inc}."
                )

            return inc[0] if max_one else inc
+
+ def _get_set(
+ self,
+ dataset: str | SEQUENCE,
+ max_one: BOOL,
+ allow_holdout: BOOL = True,
+ ) -> str | list[str]:
+ """Check and return the provided data set.
+
+ Parameters
+ ----------
+ dataset: str or sequence
+ Name(s) of the data set to retrieve.
+
+ max_one: bool
+ Whether one or multiple data sets are allowed. If True, return
+ the data set instead of a list.
+
+ allow_holdout: bool, default=True
+ Whether to allow the retrieval of the holdout set.
+
+ Returns
+ -------
+ str or list
+ Selected data set(s).
+
+ """
+ for ds in (dataset := "+".join(lst(dataset)).lower().split("+")):
+ if ds == "holdout":
+ if allow_holdout:
+ if self.holdout is None:
+ raise ValueError(
+ "Invalid value for the dataset parameter. No holdout "
+ "data set was specified when initializing the instance."
+ )
+ else:
+ raise ValueError(
+ "Invalid value for the dataset parameter, got "
+ f"{ds}. Choose from: train, test."
+ )
+ elif ds not in ("train", "test"):
+ raise ValueError(
+ "Invalid value for the dataset parameter, got {ds}. "
+ f"Choose from: train, test{', holdout' if allow_holdout else ''}."
+ )
+
+ if max_one and len(dataset) > 1:
+ raise ValueError(
+ "Invalid value for the dataset parameter, got "
+ f"{dataset}. Only one data set is allowed."
+ )
+
+ return dataset[0] if max_one else dataset
+
+ def _get_figure(self, **kwargs) -> go.Figure | plt.Figure | None:
+ """Return existing figure if in canvas, else a new figure.
+
+ Every time this method is called from a canvas, the plot
+ index is raised by one to keep track in which subplot the
+ BaseFigure is at.
+
+ Parameters
+ ----------
+ **kwargs
+ Additional keyword arguments for BaseFigure.
+
+ Returns
+ -------
+ [go.Figure][], [plt.Figure][] or None
+ Existing figure or newly created. Returns None if kwarg
+ `create_figure=False`.
+
+ """
+ if BasePlot._fig and BasePlot._fig.is_canvas:
+ return BasePlot._fig.next_subplot
+ else:
+ BasePlot._fig = BaseFigure(palette=self.palette, **kwargs)
+ return BasePlot._fig.next_subplot
+
+ def _draw_line(
+ self,
+ parent: str,
+ child: str | None = None,
+ legend: str | dict = None,
+ **kwargs,
+ ) -> go.Scatter:
+ """Draw a line.
+
+ Unify the style to draw a line, where parent and child
+ (e.g. model - data set or column - distribution) keep the
+ same style (color or dash). A legendgroup title is only added
+ when there is a child element.
+
+ Parameters
+ ----------
+ parent: str
+ Name of the model.
+
+ child: str or None, default=None
+ Data set which is plotted.
+
+ legend: str, dict or None
+ Legend argument provided by the user.
+
+ **kwargs
+ Additional keyword arguments for the trace.
+
+ Returns
+ -------
+ go.Scatter
+ New trace to add to figure.
+
+ """
+ legendgrouptitle = dict(text=parent, font_size=self.label_fontsize)
+ hover = f"(%{{x}}, %{{y}}){parent}{f' - {child}' if child else ''}"
+ return go.Scatter(
+ line=dict(
+ width=self.line_width,
+ color=BasePlot._fig.get_elem(parent),
+ dash=BasePlot._fig.get_elem(child, "dash"),
+ ),
+ marker=dict(
+ symbol=BasePlot._fig.get_elem(child, "marker"),
+ size=self.marker_size,
+ color=BasePlot._fig.get_elem(parent),
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ ),
+ hovertemplate=kwargs.pop("hovertemplate", hover),
+ name=kwargs.pop("name", child or parent),
+ legendgroup=kwargs.pop("legendgroup", parent),
+ legendgrouptitle=legendgrouptitle if child else None,
+ showlegend=BasePlot._fig.showlegend(f"{parent}-{child}", legend),
+ **kwargs,
+ )
+
    @staticmethod
    def _draw_straight_line(y: SCALAR | str, xaxis: str, yaxis: str):
        """Draw a line across the axis.

        The line can be either horizontal or diagonal. The line should
        be used as reference. It's not added to the legend and doesn't
        show any information on hover.

        Parameters
        ----------
        y: int, float or str
            Coordinates on the y-axis. If a value, draw a horizontal line
            at that value. If "diagonal", draw a diagonal line from x.

        xaxis: str
            Name of the x-axis to draw in.

        yaxis: str
            Name of the y-axis to draw in.

        """
        # x is always expressed in domain coordinates (0 to 1); y uses
        # domain coordinates only for the diagonal, else data coordinates
        BasePlot._fig.figure.add_shape(
            type="line",
            x0=0,
            x1=1,
            y0=0 if y == "diagonal" else y,
            y1=1 if y == "diagonal" else y,
            xref=f"{xaxis} domain",
            yref=f"{yaxis} domain" if y == "diagonal" else yaxis,
            line=dict(width=1, color="black", dash="dash"),
            opacity=0.6,
            layer="below",
        )
+
    def _plot(
        self,
        fig: go.Figure | plt.Figure | None = None,
        ax: plt.Axes | tuple[str, str] | None = None,
        **kwargs,
    ) -> go.Figure | plt.Figure | None:
        """Make the plot.

        Customize the axes to the default layout and plot the figure
        if it's not part of a canvas.

        Parameters
        ----------
        fig: go.Figure, plt.Figure or None
            Current figure. If None, use `plt.gcf()`.

        ax: plt.Axes, tuple or None, default=None
            Axis object or names of the axes to update. If None, ignore
            their update.

        **kwargs
            Keyword arguments containing the figure's parameters.

            - title: Name of the title or custom configuration.
            - legend: Whether to show the legend or custom configuration.
            - xlabel: Label for the x-axis.
            - ylabel: Label for the y-axis.
            - xlim: Limits for the x-axis.
            - ylim: Limits for the y-axis.
            - figsize: Size of the figure.
            - filename: Name of the saved file.
            - plotname: Name of the plot.
            - display: Whether to show the plot. If None, return the figure.

        Returns
        -------
        plt.Figure, go.Figure or None
            Created figure. Only returned if `display=None`.

        """
        # Set name with which to save the file
        if kwargs.get("filename"):
            if kwargs["filename"].endswith("auto"):
                name = kwargs["filename"].replace("auto", kwargs["plotname"])
            else:
                name = kwargs["filename"]
        else:
            name = kwargs.get("plotname")

        fig = fig or BasePlot._fig.figure
        if BasePlot._fig.backend == "plotly":
            # Plotly backend
            if ax:
                fig.update_layout(
                    {
                        f"{ax[0]}_title": dict(
                            text=kwargs.get("xlabel"), font_size=self.label_fontsize
                        ),
                        f"{ax[1]}_title": dict(
                            text=kwargs.get("ylabel"), font_size=self.label_fontsize
                        ),
                        f"{ax[0]}_range": kwargs.get("xlim"),
                        f"{ax[1]}_range": kwargs.get("ylim"),
                        f"{ax[0]}_automargin": True,
                        f"{ax[1]}_automargin": True,
                    }
                )

                if BasePlot._fig.is_canvas and (title := kwargs.get("title")):
                    # Add a subtitle to a plot in the canvas
                    default_title = {
                        "x": BasePlot._fig.pos[ax[0][5:] or "1"][0],
                        "y": BasePlot._fig.pos[ax[0][5:] or "1"][1] + 0.005,
                        "xref": "paper",
                        "yref": "paper",
                        "xanchor": "center",
                        "yanchor": "bottom",
                        "showarrow": False,
                        "font_size": self.title_fontsize - 4,
                    }

                    if isinstance(title, dict):
                        title = {**default_title, **title}
                    else:
                        title = {"text": title, **default_title}

                    fig.update_layout(dict(annotations=fig.layout.annotations + (title,)))

            # Outside a canvas, finalize the layout (title, legend, size)
            if not BasePlot._fig.is_canvas and kwargs.get("plotname"):
                default_title = dict(
                    x=0.5,
                    y=1,
                    pad=dict(t=15, b=15),
                    xanchor="center",
                    yanchor="top",
                    xref="paper",
                    font_size=self.title_fontsize,
                )
                if isinstance(title := kwargs.get("title"), dict):
                    title = {**default_title, **title}
                else:
                    title = {"text": title, **default_title}

                default_legend = dict(
                    traceorder="grouped",
                    groupclick=kwargs.get("groupclick", "toggleitem"),
                    font_size=self.label_fontsize,
                    bgcolor="rgba(255, 255, 255, 0.5)",
                )
                if isinstance(legend := kwargs.get("legend"), str):
                    position = {}
                    if legend == "upper left":
                        position = dict(x=0.01, y=0.99, xanchor="left", yanchor="top")
                    elif legend == "lower left":
                        position = dict(x=0.01, y=0.01, xanchor="left", yanchor="bottom")
                    elif legend == "upper right":
                        position = dict(x=0.99, y=0.99, xanchor="right", yanchor="top")
                    elif legend == "lower right":
                        position = dict(x=0.99, y=0.01, xanchor="right", yanchor="bottom")
                    elif legend == "upper center":
                        position = dict(x=0.5, y=0.99, xanchor="center", yanchor="top")
                    elif legend == "lower center":
                        position = dict(x=0.5, y=0.01, xanchor="center", yanchor="bottom")
                    elif legend == "center left":
                        position = dict(x=0.01, y=0.5, xanchor="left", yanchor="middle")
                    elif legend == "center right":
                        position = dict(x=0.99, y=0.5, xanchor="right", yanchor="middle")
                    elif legend == "center":
                        position = dict(x=0.5, y=0.5, xanchor="center", yanchor="middle")
                    legend = {**default_legend, **position}
                elif isinstance(legend, dict):
                    legend = {**default_legend, **legend}

                # Update layout with predefined settings
                space1 = self.title_fontsize if title.get("text") else 10
                space2 = self.title_fontsize * int(bool(fig.layout.annotations))
                fig.update_layout(
                    title=title,
                    legend=legend,
                    showlegend=bool(kwargs.get("legend")),
                    hoverlabel=dict(font_size=self.label_fontsize),
                    font_size=self.tick_fontsize,
                    margin=dict(l=50, b=50, r=0, t=25 + space1 + space2, pad=0),
                    width=kwargs["figsize"][0],
                    height=kwargs["figsize"][1],
                )

                # Update plot with custom settings
                fig.update_traces(**self._custom_traces)
                fig.update_layout(**self._custom_layout)

                if kwargs.get("filename"):
                    # Filenames without an extension are saved as html
                    if "." not in name or name.endswith(".html"):
                        fig.write_html(name if "." in name else name + ".html")
                    else:
                        fig.write_image(name)

                # Log plot to mlflow run of every model visualized
                if getattr(self, "experiment", None) and self.log_plots:
                    for m in set(BasePlot._fig.used_models):
                        MlflowClient().log_figure(
                            run_id=m._run.info.run_id,
                            figure=fig,
                            artifact_file=name if "." in name else f"{name}.html",
                        )

                if kwargs.get("display") is True:
                    fig.show()
                elif kwargs.get("display") is None:
                    return fig

        else:
            # Matplotlib backend
            # NOTE(review): the title/label setters assume ax is not None;
            # only tick_params is guarded — confirm callers always pass ax
            # when they pass title/xlabel/ylabel
            if kwargs.get("title"):
                ax.set_title(kwargs.get("title"), fontsize=self.title_fontsize, pad=20)
            if kwargs.get("xlabel"):
                ax.set_xlabel(kwargs["xlabel"], fontsize=self.label_fontsize, labelpad=12)
            if kwargs.get("ylabel"):
                ax.set_ylabel(kwargs["ylabel"], fontsize=self.label_fontsize, labelpad=12)
            if ax is not None:
                ax.tick_params(axis="both", labelsize=self.tick_fontsize)

            if kwargs.get("figsize"):
                # Convert from pixels to inches
                fig.set_size_inches(
                    kwargs["figsize"][0] // fig.get_dpi(),
                    kwargs["figsize"][1] // fig.get_dpi(),
                )
            plt.tight_layout()
            if kwargs.get("filename"):
                fig.savefig(name)

            # Log plot to mlflow run of every model visualized
            # NOTE(review): the plotly branch guards with
            # getattr(self, "experiment", None); consider the same here
            if self.experiment and self.log_plots:
                for m in set(BasePlot._fig.used_models):
                    MlflowClient().log_figure(
                        run_id=m._run.info.run_id,
                        figure=fig,
                        artifact_file=name if "." in name else f"{name}.png",
                    )

            plt.show() if kwargs.get("display") else plt.close()
            if kwargs.get("display") is None:
                return fig
+
    @composed(contextmanager, crash)
    def canvas(
        self,
        rows: INT = 1,
        cols: INT = 2,
        *,
        horizontal_spacing: FLOAT = 0.05,
        vertical_spacing: FLOAT = 0.07,
        title: str | dict | None = None,
        legend: str | dict | None = "out",
        figsize: tuple[INT, INT] | None = None,
        filename: str | None = None,
        display: BOOL = True,
    ):
        """Create a figure with multiple plots.

        This `@contextmanager` allows you to draw many plots in one
        figure. The default option is to add two plots side by side.
        See the [user guide][canvas] for an example.

        Parameters
        ----------
        rows: int, default=1
            Number of plots in length.

        cols: int, default=2
            Number of plots in width.

        horizontal_spacing: float, default=0.05
            Space between subplot rows in normalized plot coordinates.
            The spacing is relative to the figure's size.

        vertical_spacing: float, default=0.07
            Space between subplot cols in normalized plot coordinates.
            The spacing is relative to the figure's size.

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="out"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple or None, default=None
            Figure's size in pixels, format as (x, y). If None, it
            adapts the size to the number of plots in the canvas.

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool, default=True
            Whether to render the plot.

        Yields
        ------
        [go.Figure][]
            Plot object.

        """
        BasePlot._fig = BaseFigure(
            rows=rows,
            cols=cols,
            horizontal_spacing=horizontal_spacing,
            vertical_spacing=vertical_spacing,
            palette=self.palette,
            is_canvas=True,
        )

        try:
            yield BasePlot._fig.figure
        finally:
            # Finalize the figure even if the body raised
            BasePlot._fig.is_canvas = False  # Close the canvas
            self._plot(
                groupclick="togglegroup",
                title=title,
                legend=legend,
                figsize=figsize or (550 + 350 * cols, 200 + 400 * rows),
                plotname="canvas",
                filename=filename,
                display=display,
            )
+
+ def reset_aesthetics(self):
+ """Reset the plot [aesthetics][] to their default values."""
+ self._custom_layout = {}
+ self._custom_traces = {}
+ self._aesthetics = Aesthetics(
+ palette=PALETTE,
+ title_fontsize=24,
+ label_fontsize=16,
+ tick_fontsize=12,
+ line_width=2,
+ marker_size=8,
+ )
+
    def update_layout(self, **kwargs):
        """Update the properties of the plot's layout.

        Recursively update the structure of the original layout with
        the values in the arguments.

        Parameters
        ----------
        **kwargs
            Keyword arguments for the figure's [update_layout][] method.

        """
        # Replaces (doesn't merge with) any previously stored settings
        self._custom_layout = kwargs
+
    def update_traces(self, **kwargs):
        """Update the properties of the plot's traces.

        Recursively update the structure of the original traces with
        the values in the arguments.

        Parameters
        ----------
        **kwargs
            Keyword arguments for the figure's [update_traces][] method.

        """
        # Replaces (doesn't merge with) any previously stored settings
        self._custom_traces = kwargs
diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py
new file mode 100644
index 000000000..105e7cd6d
--- /dev/null
+++ b/atom/plots/dataplot.py
@@ -0,0 +1,985 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing the DataPlot class.
+
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from nltk.collocations import (
+ BigramCollocationFinder, QuadgramCollocationFinder,
+ TrigramCollocationFinder,
+)
+from scipy import stats
+from typeguard import typechecked
+
+from atom.plots.base import BasePlot
+from atom.utils.constants import PALETTE
+from atom.utils.types import INT, LEGEND, SEQUENCE, SERIES, SLICE
+from atom.utils.utils import (
+ check_dependency, crash, divide, get_corpus, lst, rnd,
+)
+
+
+@typechecked
+class DataPlot(BasePlot):
+ """Data plots.
+
+ Plots used for understanding and interpretation of the dataset.
+ They are only accessible from atom. The other runners should
+ be used for model training only, not for data manipulation.
+
+ """
+
+ @crash
+ def plot_correlation(
+ self,
+ columns: slice | SEQUENCE | None = None,
+ method: str = "pearson",
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (800, 700),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot a correlation matrix.
+
+ Displays a heatmap showing the correlation between columns in
+ the dataset. The colors red, blue and white stand for positive,
+ negative, and no correlation respectively.
+
+ Parameters
+ ----------
+ columns: slice, sequence or None, default=None
+ Columns to plot. If None, plot all columns in the dataset.
+ Selected categorical columns are ignored.
+
+ method: str, default="pearson"
+ Method of correlation. Choose from: pearson, kendall or
+ spearman.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple, default=(800, 700)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:DataPlot.plot_distribution
+ atom.plots:DataPlot.plot_qq
+ atom.plots:DataPlot.plot_relationships
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.plot_correlation()
+ ```
+
+ """
+ columns = self.branch._get_columns(columns, only_numerical=True)
+ if method.lower() not in ("pearson", "kendall", "spearman"):
+ raise ValueError(
+ f"Invalid value for the method parameter, got {method}. "
+ "Choose from: pearson, kendall or spearman."
+ )
+
+ # Compute the correlation matrix
+ corr = self.dataset[columns].corr(method=method.lower())
+
+ # Generate a mask for the lower triangle
+ # k=1 means keep outermost diagonal line
+ mask = np.zeros_like(corr, dtype=bool)
+ mask[np.triu_indices_from(mask, k=1)] = True
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(0, 0.87),
+ coloraxis=dict(
+ colorscale="rdbu_r",
+ cmin=-1,
+ cmax=1,
+ title=f"{method.lower()} correlation",
+ font_size=self.label_fontsize,
+ ),
+ )
+
+ fig.add_trace(
+ go.Heatmap(
+ z=corr.mask(mask),
+ x=columns,
+ y=columns,
+ coloraxis=f"coloraxis{xaxis[1:]}",
+ hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}",
+ hoverongaps=False,
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ "template": "plotly_white",
+ f"yaxis{yaxis[1:]}_autorange": "reversed",
+ f"xaxis{xaxis[1:]}_showgrid": False,
+ f"yaxis{yaxis[1:]}_showgrid": False,
+ }
+ )
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_correlation",
+ filename=filename,
+ display=display,
+ )
+
+ @crash
+ def plot_distribution(
+ self,
+ columns: SLICE = 0,
+ distributions: str | SEQUENCE | None = None,
+ show: INT | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper right",
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot column distributions.
+
+ - For numerical columns, plot the probability density
+ distribution. Additionally, it's possible to plot any of
+ `scipy.stats` distributions fitted to the column.
+ - For categorical columns, plot the class distribution.
+ Only one categorical column can be plotted at the same time.
+
+ !!! tip
+ Use atom's [distribution][atomclassifier-distribution]
+ method to check which distribution fits the column best.
+
+ Parameters
+ ----------
+ columns: int, str, slice or sequence, default=0
+ Columns to plot. It's only possible to plot one categorical
+ column. If more than one categorical columns are selected,
+ all categorical columns are ignored.
+
+ distributions: str, sequence or None, default=None
+ Names of the `scipy.stats` distributions to fit to the
+ columns. If None, a [Gaussian kde distribution][kde] is
+ shown. Only for numerical columns.
+
+ show: int or None, default=None
+ Number of classes (ordered by number of occurrences) to
+ show in the plot. If None, it shows all classes. Only for
+ categorical columns.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None: No title is shown.
+ - If str: Text for the title.
+ - If dict: [title configuration][parameters].
+
+ legend: str, dict or None, default="upper right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the plot's type.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:DataPlot.plot_correlation
+ atom.plots:DataPlot.plot_qq
+ atom.plots:DataPlot.plot_relationships
+
+ Examples
+ --------
+ ```pycon
+ import numpy as np
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ # Add a categorical feature
+ animals = ["cat", "dog", "bird", "lion", "zebra"]
+ probabilities = [0.001, 0.1, 0.2, 0.3, 0.399]
+ X["animals"] = np.random.choice(animals, size=len(X), p=probabilities)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.plot_distribution(columns=[0, 1])
+ atom.plot_distribution(columns=0, distributions=["norm", "invgauss"])
+ atom.plot_distribution(columns="animals")
+ ```
+
+ """
+ columns = self.branch._get_columns(columns)
+ cat_columns = list(self.dataset.select_dtypes(exclude="number").columns)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ if len(columns) == 1 and columns[0] in cat_columns:
+ series = self.dataset[columns[0]].value_counts(ascending=True)
+
+ if show is None or show > len(series):
+ show = len(series)
+ elif show < 1:
+ raise ValueError(
+ "Invalid value for the show parameter. "
+ f"Value should be >0, got {show}."
+ )
+
+ color = BasePlot._fig.get_elem()
+ fig.add_trace(
+ go.Bar(
+ x=series,
+ y=series.index,
+ orientation="h",
+ marker=dict(
+ color=f"rgba({color[4:-1]}, 0.2)",
+ line=dict(width=2, color=color),
+ ),
+ hovertemplate="%{x}",
+ name=f"{columns[0]}: {len(series)} classes",
+ showlegend=BasePlot._fig.showlegend("dist", legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Counts",
+ ylim=(len(series) - show - 0.5, len(series) - 0.5),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_distribution",
+ filename=filename,
+ display=display,
+ )
+
+ else:
+ for col in [c for c in columns if c not in cat_columns]:
+ fig.add_trace(
+ go.Histogram(
+ x=self.dataset[col],
+ histnorm="probability density",
+ marker=dict(
+ color=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
+ line=dict(width=2, color=BasePlot._fig.get_elem(col)),
+ ),
+ nbinsx=40,
+ name="dist",
+ legendgroup=col,
+ legendgrouptitle=dict(text=col, font_size=self.label_fontsize),
+ showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ x = np.linspace(self.dataset[col].min(), self.dataset[col].max(), 200)
+
+ # Drop missing values for compatibility with scipy.stats
+ missing = self.missing + [np.inf, -np.inf]
+ values = self.dataset[col].replace(missing, np.NaN).dropna()
+
+ if distributions:
+ # Get a line for each distribution
+ for j, dist in enumerate(lst(distributions)):
+ params = getattr(stats, dist).fit(values)
+
+ fig.add_trace(
+ self._draw_line(
+ x=x,
+ y=getattr(stats, dist).pdf(x, *params),
+ parent=col,
+ child=dist,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+ else:
+ # If no distributions specified, draw Gaussian kde
+ fig.add_trace(
+ self._draw_line(
+ x=x,
+ y=stats.gaussian_kde(values)(x),
+ parent=col,
+ child="kde",
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(dict(barmode="overlay"))
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Values",
+ ylabel="Probability density",
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 600),
+ plotname="plot_distribution",
+ filename=filename,
+ display=display,
+ )
+
+ @crash
+ def plot_ngrams(
+ self,
+ ngram: INT | str = "bigram",
+ index: SLICE | None = None,
+ show: INT = 10,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot n-gram frequencies.
+
+ The text for the plot is extracted from the column named
+ `corpus`. If there is no column with that name, an exception
+ is raised. If the documents are not tokenized, the words are
+ separated by spaces.
+
+ !!! tip
+ Use atom's [tokenize][atomclassifier-tokenize] method to
+ separate the words creating n-grams based on their frequency
+ in the corpus.
+
+ Parameters
+ ----------
+ ngram: str or int, default="bigram"
+ Number of contiguous words to search for (size of n-gram).
+ Choose from: words (1), bigrams (2), trigrams (3),
+ quadgrams (4).
+
+ index: int, str, slice, sequence or None, default=None
+ Documents in the corpus to include in the search. If None,
+ it selects all documents in the dataset.
+
+ show: int, default=10
+ Number of n-grams (ordered by number of occurrences) to
+ show in the plot.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of n-grams shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:DataPlot.plot_wordcloud
+
+ Examples
+ --------
+ ```pycon
+ import numpy as np
+ from atom import ATOMClassifier
+ from sklearn.datasets import fetch_20newsgroups
+
+ X, y = fetch_20newsgroups(
+ return_X_y=True,
+ categories=["alt.atheism", "sci.med", "comp.windows.x"],
+ shuffle=True,
+ random_state=1,
+ )
+ X = np.array(X).reshape(-1, 1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.textclean()
+ atom.textnormalize()
+ atom.plot_ngrams()
+ ```
+
+ """
+
+ def get_text(column: SERIES) -> SERIES:
+ """Get the complete corpus as sequence of tokens.
+
+ Parameters
+ ----------
+ column: series
+ Column containing the corpus.
+
+ Returns
+ -------
+ series
+ Corpus of tokens.
+
+ """
+ if isinstance(column.iat[0], str):
+ return column.apply(lambda row: row.split())
+ else:
+ return column
+
+ corpus = get_corpus(self.X)
+ rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)]
+
+ if str(ngram).lower() in ("1", "word", "words"):
+ ngram = "words"
+ series = pd.Series(
+ [word for row in get_text(rows[corpus]) for word in row]
+ ).value_counts(ascending=True)
+ else:
+ if str(ngram).lower() in ("2", "bigram", "bigrams"):
+ ngram, finder = "bigrams", BigramCollocationFinder
+ elif str(ngram).lower() in ("3", "trigram", "trigrams"):
+ ngram, finder = "trigrams", TrigramCollocationFinder
+ elif str(ngram).lower() in ("4", "quadgram", "quadgrams"):
+ ngram, finder = "quadgrams", QuadgramCollocationFinder
+ else:
+ raise ValueError(
+ f"Invalid value for the ngram parameter, got {ngram}. "
+ "Choose from: words, bigram, trigram, quadgram."
+ )
+
+ ngram_fd = finder.from_documents(get_text(rows[corpus])).ngram_fd
+ series = pd.Series(
+ data=[x[1] for x in ngram_fd.items()],
+ index=[" ".join(x[0]) for x in ngram_fd.items()],
+ ).sort_values(ascending=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ fig.add_trace(
+ go.Bar(
+ x=(data := series[-show:]),
+ y=data.index,
+ orientation="h",
+ marker=dict(
+ color=f"rgba({BasePlot._fig.get_elem(ngram)[4:-1]}, 0.2)",
+ line=dict(width=2, color=BasePlot._fig.get_elem(ngram)),
+ ),
+ hovertemplate="%{x}",
+ name=f"Total {ngram}: {len(series)}",
+ legendgroup=ngram,
+ showlegend=BasePlot._fig.showlegend(ngram, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Counts",
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_ngrams",
+ filename=filename,
+ display=display,
+ )
+
+ @crash
+ def plot_qq(
+ self,
+ columns: SLICE = 0,
+ distributions: str | SEQUENCE = "norm",
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot a quantile-quantile plot.
+
+ Columns are distinguished by color and the distributions are
+ distinguished by marker type. Missing values are ignored.
+
+ Parameters
+ ----------
+ columns: int, str, slice or sequence, default=0
+ Columns to plot. Selected categorical columns are ignored.
+
+ distributions: str or sequence, default="norm"
+ Names of the `scipy.stats` distributions to fit to the
+ columns.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:DataPlot.plot_correlation
+ atom.plots:DataPlot.plot_distribution
+ atom.plots:DataPlot.plot_relationships
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.plot_qq(columns=[5, 6])
+ atom.plot_qq(columns=0, distributions=["norm", "invgauss", "triang"])
+ ```
+
+ """
+ columns = self.branch._get_columns(columns)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ percentiles = np.linspace(0, 100, 101)
+ for col in columns:
+ # Drop missing values for compatibility with scipy.stats
+ missing = self.missing + [np.inf, -np.inf]
+ values = self.dataset[col].replace(missing, np.NaN).dropna()
+
+ for dist in lst(distributions):
+ stat = getattr(stats, dist)
+ params = stat.fit(values)
+ samples = stat.rvs(*params, size=101, random_state=self.random_state)
+
+ fig.add_trace(
+ self._draw_line(
+ x=np.percentile(samples, percentiles),
+ y=np.percentile(values, percentiles),
+ mode="markers",
+ parent=col,
+ child=dist,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Theoretical quantiles",
+ ylabel="Observed quantiles",
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 600),
+ plotname="plot_qq",
+ filename=filename,
+ display=display,
+ )
+
+ @crash
+ def plot_relationships(
+ self,
+ columns: slice | SEQUENCE = (0, 1, 2),
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (900, 900),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot pairwise relationships in a dataset.
+
+ Creates a grid of axes such that each numerical column appears
+ once on the x-axes and once on the y-axes. The bottom triangle
+ contains scatter plots (max 250 random samples), the diagonal
+ plots contain column distributions, and the upper triangle
+ contains contour histograms for all samples in the columns.
+
+ Parameters
+ ----------
+ columns: slice or sequence, default=(0, 1, 2)
+ Columns to plot. Selected categorical columns are ignored.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple, default=(900, 900)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:DataPlot.plot_correlation
+ atom.plots:DataPlot.plot_distribution
+ atom.plots:DataPlot.plot_qq
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.plot_relationships(columns=[0, 4, 5])
+ ```
+
+ """
+ columns = self.branch._get_columns(columns, only_numerical=True)
+
+ # Use max 250 samples to not clutter the plot
+ sample = lambda col: self.dataset[col].sample(
+ n=min(len(self.dataset), 250), random_state=self.random_state
+ )
+
+ fig = self._get_figure()
+ color = BasePlot._fig.get_elem()
+ for i in range(len(columns)**2):
+ x, y = i // len(columns), i % len(columns)
+
+ # Calculate the distance between subplots
+ offset = divide(0.0125, (len(columns) - 1))
+
+ # Calculate the size of the subplot
+ size = (1 - ((offset * 2) * (len(columns) - 1))) / len(columns)
+
+ # Determine the position for the axes
+ x_pos = y * (size + 2 * offset)
+ y_pos = (len(columns) - x - 1) * (size + 2 * offset)
+
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(x_pos, rnd(x_pos + size)),
+ y=(y_pos, rnd(y_pos + size)),
+ coloraxis=dict(
+ colorscale=PALETTE.get(color, "Blues"),
+ cmin=0,
+ cmax=len(self.dataset),
+ showscale=False,
+ )
+ )
+
+ if x == y:
+ fig.add_trace(
+ go.Histogram(
+ x=self.dataset[columns[x]],
+ marker=dict(
+ color=f"rgba({color[4:-1]}, 0.2)",
+ line=dict(width=2, color=color),
+ ),
+ name=columns[x],
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+ elif x > y:
+ fig.add_trace(
+ go.Scatter(
+ x=sample(columns[y]),
+ y=sample(columns[x]),
+ mode="markers",
+ marker=dict(color=color),
+ hovertemplate="(%{x}, %{y})",
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+ elif y > x:
+ fig.add_trace(
+ go.Histogram2dContour(
+ x=self.dataset[columns[y]],
+ y=self.dataset[columns[x]],
+ coloraxis=f"coloraxis{xaxis[1:]}",
+ hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}",
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ if x < len(columns) - 1:
+ fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
+ if y > 0:
+ fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
+
+ self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel=columns[y] if x == len(columns) - 1 else None,
+ ylabel=columns[x] if y == 0 else None,
+ )
+
+ return self._plot(
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 900),
+ plotname="plot_relationships",
+ filename=filename,
+ display=display,
+ )
+
+ @crash
+ def plot_wordcloud(
+ self,
+ index: SLICE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ **kwargs,
+ ) -> go.Figure | None:
+ """Plot a wordcloud from the corpus.
+
+ The text for the plot is extracted from the column named
+ `corpus`. If there is no column with that name, an exception
+ is raised.
+
+ Parameters
+ ----------
+ index: int, str, slice, sequence or None, default=None
+ Documents in the corpus to include in the wordcloud. If
+ None, it selects all documents in the dataset.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ **kwargs
+ Additional keyword arguments for the [Wordcloud][] object.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:DataPlot.plot_ngrams
+ atom.plots:PredictionPlot.plot_pipeline
+
+ Examples
+ --------
+ ```pycon
+ import numpy as np
+ from atom import ATOMClassifier
+ from sklearn.datasets import fetch_20newsgroups
+
+ X, y = fetch_20newsgroups(
+ return_X_y=True,
+ categories=["alt.atheism", "sci.med", "comp.windows.x"],
+ shuffle=True,
+ random_state=1,
+ )
+ X = np.array(X).reshape(-1, 1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.textclean()
+ atom.textnormalize()
+ atom.plot_wordcloud()
+ ```
+
+ """
+
+ def get_text(column):
+ """Get the complete corpus as one long string."""
+ if isinstance(column.iat[0], str):
+ return " ".join(column)
+ else:
+ return " ".join([" ".join(row) for row in column])
+
+ check_dependency("wordcloud")
+ from wordcloud import WordCloud
+
+ corpus = get_corpus(self.X)
+ rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)]
+
+ wordcloud = WordCloud(
+ width=figsize[0],
+ height=figsize[1],
+ background_color=kwargs.pop("background_color", "white"),
+ random_state=kwargs.pop("random_state", self.random_state),
+ **kwargs,
+ )
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ fig.add_trace(
+ go.Image(
+ z=wordcloud.generate(get_text(rows[corpus])),
+ hoverinfo="skip",
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ f"xaxis{xaxis[1:]}_showticklabels": False,
+ f"yaxis{yaxis[1:]}_showticklabels": False,
+ }
+ )
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 600),
+ plotname="plot_wordcloud",
+ filename=filename,
+ display=display,
+ )
diff --git a/atom/plots/featureselectionplot.py b/atom/plots/featureselectionplot.py
new file mode 100644
index 000000000..79f83e1f3
--- /dev/null
+++ b/atom/plots/featureselectionplot.py
@@ -0,0 +1,428 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing the FeatureSelectionPlot class.
+
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import plotly.graph_objects as go
+from sklearn.utils.metaestimators import available_if
+from typeguard import typechecked
+
+from atom.plots.base import BasePlot
+from atom.utils.types import INT, LEGEND
+from atom.utils.utils import crash, has_attr
+
+
+@typechecked
+class FeatureSelectionPlot(BasePlot):
+ """Feature selection plots.
+
+ These plots are accessible from atom or from the FeatureSelector
+ class when the appropriate feature selection strategy is used.
+
+ """
+
+ @available_if(has_attr("pca"))
+ @crash
+ def plot_components(
+ self,
+ show: INT | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the explained variance ratio per component.
+
+ Kept components are colored and discarded components are
+ transparent. This plot is available only when feature selection
+ was applied with strategy="pca".
+
+ Parameters
+ ----------
+ show: int or None, default=None
+ Number of components to show. None to show all.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of components shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:FeatureSelectionPlot.plot_pca
+ atom.plots:FeatureSelectionPlot.plot_rfecv
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.feature_selection("pca", n_features=5)
+ atom.plot_components(show=10)
+ ```
+
+ """
+ if show is None or show > self.pca.components_.shape[0]:
+ # Limit max features shown to avoid maximum figsize error
+ show = min(200, self.pca.components_.shape[0])
+ elif show < 1:
+ raise ValueError(
+ "Invalid value for the show parameter. "
+ f"Value should be >0, got {show}."
+ )
+
+ # Get the variance ratio per component
+ variance = np.array(self.pca.explained_variance_ratio_)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ # Create color scheme: first normal and then fully transparent
+ color = BasePlot._fig.get_elem("components")
+ opacity = [0.2] * self.pca._comps + [0] * (len(variance) - self.pca._comps)
+
+ fig.add_trace(
+ go.Bar(
+ x=variance,
+ y=[f"pca{str(i)}" for i in range(len(variance))],
+ orientation="h",
+ marker=dict(
+ color=[f"rgba({color[4:-1]}, {o})" for o in opacity],
+ line=dict(width=2, color=color),
+ ),
+ hovertemplate="%{x}",
+ name=f"Variance retained: {variance[:self.pca._comps].sum():.3f}",
+ legendgroup="components",
+ showlegend=BasePlot._fig.showlegend("components", legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout({f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending")})
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Explained variance ratio",
+ ylim=(len(variance) - show - 0.5, len(variance) - 0.5),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_components",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_attr("pca"))
+ @crash
+ def plot_pca(
+ self,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the explained variance ratio vs number of components.
+
+ If the underlying estimator is [PCA][] (for dense datasets),
+ all possible components are plotted. If the underlying estimator
+ is [TruncatedSVD][] (for sparse datasets), it only shows the
+ selected components. The star marks the number of components
+ selected by the user. This plot is available only when feature
+ selection was applied with strategy="pca".
+
+ Parameters
+ ----------
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:FeatureSelectionPlot.plot_components
+ atom.plots:FeatureSelectionPlot.plot_rfecv
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.feature_selection("pca", n_features=5)
+ atom.plot_pca()
+ ```
+
+ """
+ # Create star symbol at selected number of components
+ symbols = ["circle"] * self.pca.n_features_in_
+ symbols[self.pca._comps - 1] = "star"
+ sizes = [self.marker_size] * self.pca.n_features_in_
+ sizes[self.pca._comps - 1] = self.marker_size * 1.5
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ fig.add_trace(
+ go.Scatter(
+ x=tuple(range(1, self.pca.n_features_in_ + 1)),
+ y=np.cumsum(self.pca.explained_variance_ratio_),
+ mode="lines+markers",
+ line=dict(width=self.line_width, color=BasePlot._fig.get_elem("pca")),
+ marker=dict(
+ symbol=symbols,
+ size=sizes,
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ opacity=1,
+ ),
+ hovertemplate="%{y}",
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ "hovermode": "x",
+ f"xaxis{xaxis[1:]}_showspikes": True,
+ f"yaxis{yaxis[1:]}_showspikes": True,
+ }
+ )
+
+ margin = self.pca.n_features_in_ / 30
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="First N principal components",
+ ylabel="Cumulative variance ratio",
+ xlim=(1 - margin, self.pca.n_features_in_ - 1 + margin),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_pca",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_attr("rfecv"))
+ @crash
+ def plot_rfecv(
+ self,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the rfecv results.
+
+ Plot the scores obtained by the estimator fitted on every
+ subset of the dataset. Only available when feature selection
+ was applied with strategy="rfecv".
+
+ Parameters
+ ----------
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:FeatureSelectionPlot.plot_components
+ atom.plots:FeatureSelectionPlot.plot_pca
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.feature_selection("rfecv", solver="Tree")
+ atom.plot_rfecv()
+ ```
+
+ """
+ try: # Define the y-label for the plot
+ ylabel = self.rfecv.get_params()["scoring"].name
+ except AttributeError:
+ ylabel = "accuracy" if self.goal.startswith("class") else "r2"
+
+ x = range(self.rfecv.min_features_to_select, self.rfecv.n_features_in_ + 1)
+
+ # Create star symbol at selected number of features
+ sizes = [6] * len(x)
+ sizes[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = 12
+ symbols = ["circle"] * len(x)
+ symbols[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = "star"
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ mean = self.rfecv.cv_results_["mean_test_score"]
+ std = self.rfecv.cv_results_["std_test_score"]
+
+ fig.add_trace(
+ go.Scatter(
+ x=list(x),
+ y=mean,
+ mode="lines+markers",
+ line=dict(width=self.line_width, color=BasePlot._fig.get_elem("rfecv")),
+ marker=dict(
+ symbol=symbols,
+ size=sizes,
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ opacity=1,
+ ),
+ name=ylabel,
+ legendgroup="rfecv",
+ showlegend=BasePlot._fig.showlegend("rfecv", legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ # Add error bands
+ fig.add_traces(
+ [
+ go.Scatter(
+ x=tuple(x),
+ y=mean + std,
+ mode="lines",
+ line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")),
+ hovertemplate="%{y}<extra>upper bound</extra>",
+ legendgroup="rfecv",
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ ),
+ go.Scatter(
+ x=tuple(x),
+ y=mean - std,
+ mode="lines",
+ line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")),
+ fill="tonexty",
+ fillcolor=f"rgba{BasePlot._fig.get_elem('rfecv')[3:-1]}, 0.2)",
+ hovertemplate="%{y}<extra>lower bound</extra>",
+ legendgroup="rfecv",
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ ),
+ ]
+ )
+
+ fig.update_layout({"hovermode": "x unified"})
+
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="togglegroup",
+ xlabel="Number of features",
+ ylabel=ylabel,
+ xlim=(min(x) - len(x) / 30, max(x) + len(x) / 30),
+ ylim=(min(mean) - 3 * max(std), max(mean) + 3 * max(std)),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_rfecv",
+ filename=filename,
+ display=display,
+ )
diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py
new file mode 100644
index 000000000..08f09893a
--- /dev/null
+++ b/atom/plots/hyperparametertuningplot.py
@@ -0,0 +1,1453 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing the HyperparameterTuningPlot class.
+
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+import numpy as np
+import plotly.graph_objects as go
+from optuna.importance import FanovaImportanceEvaluator
+from optuna.trial import TrialState
+from optuna.visualization._parallel_coordinate import (
+ _get_dims_from_info, _get_parallel_coordinate_info,
+)
+from optuna.visualization._terminator_improvement import _get_improvement_info
+from optuna.visualization._utils import _is_log_scale
+from sklearn.utils._bunch import Bunch
+from typeguard import typechecked
+
+from atom.plots.base import BasePlot
+from atom.utils.constants import PALETTE
+from atom.utils.types import INT, INT_TYPES, LEGEND, MODEL, SEQUENCE
+from atom.utils.utils import (
+ check_dependency, check_hyperparams, composed, crash, divide, it, lst,
+ plot_from_model, rnd,
+)
+
+
+@typechecked
+class HyperparameterTuningPlot(BasePlot):
+ """Hyperparameter tuning plots.
+
+ Plots that help interpret the model's study and corresponding
+ trials. These plots are accessible from the runners or from the
+ models. If called from a runner, the `models` parameter has to be
+ specified (if None, uses all models). If called from a model, that
+ model is used and the `models` parameter becomes unavailable.
+
+ """
+
+ @composed(crash, plot_from_model)
+ def plot_edf(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ metric: INT | str | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper left",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the Empirical Distribution Function of a study.
+
+ Use this plot to analyze and improve hyperparameter search
+ spaces. The EDF assumes that the value of the objective
+ function is in accordance with the uniform distribution over
+ the objective space. This plot is only available for models
+ that ran [hyperparameter tuning][].
+
+ !!! note
+ Only complete trials are considered when plotting the EDF.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models that used hyperparameter
+ tuning are selected.
+
+ metric: int, str, sequence or None, default=None
+ Metric to plot (only for multi-metric runs). If str, add `+`
+ between options to select more than one. If None, the metric
+ used to run the pipeline is selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper left"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_hyperparameters
+ atom.plots:HyperparameterTuningPlot.plot_trials
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from optuna.distributions import IntDistribution
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+
+ # Run three models with different search spaces
+ atom.run(
+ models="RF_1",
+ n_trials=10,
+ ht_params={"distributions": {"n_estimators": IntDistribution(6, 10)}},
+ )
+ atom.run(
+ models="RF_2",
+ n_trials=10,
+ ht_params={"distributions": {"n_estimators": IntDistribution(11, 15)}},
+ )
+ atom.run(
+ models="RF_3",
+ n_trials=10,
+ ht_params={"distributions": {"n_estimators": IntDistribution(16, 20)}},
+ )
+
+ atom.plot_edf()
+ ```
+
+ """
+ models = check_hyperparams(models, "plot_edf")
+ metric = self._get_metric(metric, max_one=False)
+
+ values = []
+ for m in models:
+ values.append([])
+ for met in metric:
+ values[-1].append(np.array([lst(row)[met] for row in m.trials["score"]]))
+
+ x_min = np.nanmin(np.array(values))
+ x_max = np.nanmax(np.array(values))
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m, val in zip(models, values):
+ for met in metric:
+ fig.add_trace(
+ self._draw_line(
+ x=(x := np.linspace(x_min, x_max, 100)),
+ y=np.sum(val[met][:, np.newaxis] <= x, axis=0) / len(val[met]),
+ parent=m.name,
+ child=self._metric[met].name,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ ylim=(0, 1),
+ xlabel="Score",
+ ylabel="Cumulative Probability",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_edf",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model)
+ def plot_hyperparameter_importance(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ metric: int | str = 0,
+ show: INT | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot a model's hyperparameter importance.
+
+ The hyperparameter importance are calculated using the
+ [fANOVA][] importance evaluator. The sum of importances for all
+ parameters (per model) is 1. This plot is only available for
+ models that ran [hyperparameter tuning][].
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models that used hyperparameter
+ tuning are selected.
+
+ metric: int or str, default=0
+ Metric to plot (only for multi-metric runs).
+
+ show: int or None, default=None
+ Number of hyperparameters (ordered by importance) to show.
+ None to show all.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of hyperparameters shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_feature_importance
+ atom.plots:HyperparameterTuningPlot.plot_hyperparameters
+ atom.plots:HyperparameterTuningPlot.plot_trials
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["ET", "RF"], n_trials=10)
+ atom.plot_hyperparameter_importance()
+ ```
+
+ """
+ models = check_hyperparams(models, "plot_hyperparameter_importance")
+ params = len(set([k for m in lst(models) for k in m._ht["distributions"]]))
+ met = self._get_metric(metric, max_one=True)
+
+ if show is None or show > params:
+ # Limit max features shown to avoid maximum figsize error
+ show = min(200, params)
+ elif show < 1:
+ raise ValueError(
+ f"Invalid value for the show parameter. Value should be >0, got {show}."
+ )
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ importances = FanovaImportanceEvaluator(seed=self.random_state).evaluate(
+ study=m.study,
+ target=None if len(self._metric) == 1 else lambda x: x.values[met],
+ )
+
+ fig.add_trace(
+ go.Bar(
+ x=np.array(list(importances.values())) / sum(importances.values()),
+ y=list(importances.keys()),
+ orientation="h",
+ marker=dict(
+ color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
+ ),
+ hovertemplate="%{x}",
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ "bargroupgap": 0.05,
+ }
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Normalized hyperparameter importance",
+ ylim=(params - show - 0.5, params - 0.5),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_hyperparameter_importance",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_hyperparameters(
+ self,
+ models: INT | str | MODEL | None = None,
+ params: str | slice | SEQUENCE = (0, 1),
+ metric: int | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot hyperparameter relationships in a study.
+
+ A model's hyperparameters are plotted against each other. The
+ corresponding metric scores are displayed in a contour plot.
+ The markers are the trials in the study. This plot is only
+ available for models that ran [hyperparameter tuning][].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_hyperparameters()`.
+
+ params: str, slice or sequence, default=(0, 1)
+ Hyperparameters to plot. Use a sequence or add `+` between
+ options to select more than one.
+
+ metric: int or str, default=0
+ Metric to plot (only for multi-metric runs).
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of hyperparameters shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_hyperparameter_importance
+ atom.plots:HyperparameterTuningPlot.plot_parallel_coordinate
+ atom.plots:HyperparameterTuningPlot.plot_trials
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR", n_trials=15)
+ atom.plot_hyperparameters(params=(0, 1, 2))
+ ```
+
+ """
+ m = check_hyperparams(models, "plot_hyperparameters")[0]
+
+ if len(params := self._get_hyperparams(params, models)) < 2:
+ raise ValueError(
+ "Invalid value for the hyperparameters parameter. A minimum "
+ f"of two parameters is required, got {len(params)}."
+ )
+
+ met = self._get_metric(metric, max_one=True)
+
+ fig = self._get_figure()
+ for i in range((length := len(params) - 1) ** 2):
+ x, y = i // length, i % length
+
+ if y <= x:
+ # Calculate the size of the subplot
+ size = 1 / length
+
+ # Determine the position for the axes
+ x_pos = y * size
+ y_pos = (length - x - 1) * size
+
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(x_pos, rnd(x_pos + size)),
+ y=(y_pos, rnd(y_pos + size)),
+ coloraxis=dict(
+ axes="99",
+ colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"),
+ cmin=np.nanmin(
+ m.trials.apply(lambda x: lst(x["score"])[met], axis=1)
+ ),
+ cmax=np.nanmax(
+ m.trials.apply(lambda x: lst(x["score"])[met], axis=1)
+ ),
+ showscale=False,
+ )
+ )
+
+ x_values = lambda row: row["params"].get(params[y], None)
+ y_values = lambda row: row["params"].get(params[x + 1], None)
+
+ fig.add_trace(
+ go.Scatter(
+ x=m.trials.apply(x_values, axis=1),
+ y=m.trials.apply(y_values, axis=1),
+ mode="markers",
+ marker=dict(
+ size=self.marker_size,
+ color=BasePlot._fig.get_elem(m.name),
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ ),
+ customdata=list(
+ zip(
+ m.trials.index.tolist(),
+ m.trials.apply(lambda x: lst(x["score"])[met], axis=1),
+ )
+ ),
+ hovertemplate=(
+ f"{params[y]}:%{{x}}<br>"
+ f"{params[x + 1]}:%{{y}}<br>"
+ f"{self._metric[met].name}:%{{customdata[1]:.4f}}"
+ "<extra>Trial %{customdata[0]}</extra>"
+ ),
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.add_trace(
+ go.Contour(
+ x=m.trials.apply(x_values, axis=1),
+ y=m.trials.apply(y_values, axis=1),
+ z=m.trials.apply(lambda i: lst(i["score"])[met], axis=1),
+ contours=dict(
+ showlabels=True,
+ labelfont=dict(size=self.tick_fontsize, color="white")
+ ),
+ coloraxis="coloraxis99",
+ hoverinfo="skip",
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ if _is_log_scale(m.study.trials, params[y]):
+ fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"})
+ if _is_log_scale(m.study.trials, params[x + 1]):
+ fig.update_layout({f"yaxis{xaxis[1:]}_type": "log"})
+
+ if x < length - 1:
+ fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
+ if y > 0:
+ fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
+
+ fig.update_layout(
+ {
+ "template": "plotly_white",
+ f"xaxis{xaxis[1:]}_showgrid": False,
+ f"yaxis{yaxis[1:]}_showgrid": False,
+ f"xaxis{yaxis[1:]}_zeroline": False,
+ f"yaxis{yaxis[1:]}_zeroline": False,
+ }
+ )
+
+ self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel=params[y] if x == length - 1 else None,
+ ylabel=params[x + 1] if y == 0 else None,
+ )
+
+ BasePlot._fig.used_models.append(m)
+ return self._plot(
+ title=title,
+ legend=legend,
+ figsize=figsize or (800 + 100 * length, 500 + 100 * length),
+ plotname="plot_hyperparameters",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_parallel_coordinate(
+ self,
+ models: INT | str | MODEL | None = None,
+ params: str | slice | SEQUENCE | None = None,
+ metric: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot high-dimensional parameter relationships in a study.
+
+ Every line of the plot represents one trial. This plot is only
+ available for models that ran [hyperparameter tuning][].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_parallel_coordinate()`.
+
+ params: str, slice, sequence or None, default=None
+ Hyperparameters to plot. Use a sequence or add `+` between
+ options to select more than one. If None, all the model's
+ hyperparameters are selected.
+
+ metric: int or str, default=0
+ Metric to plot (only for multi-metric runs).
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of hyperparameters shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_edf
+ atom.plots:HyperparameterTuningPlot.plot_hyperparameter_importance
+ atom.plots:HyperparameterTuningPlot.plot_hyperparameters
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("RF", n_trials=15)
+ atom.plot_parallel_coordinate(params=slice(1, 5))
+ ```
+
+ """
+
+ def sort_mixed_types(values: list[str]) -> list[str]:
+ """Sort a sequence of numbers and strings.
+
+ Numbers are converted and take precedence over strings.
+
+ Parameters
+ ----------
+ values: list
+ Values to sort.
+
+ Returns
+ -------
+ list of str
+ Sorted values.
+
+ """
+ numbers, categorical = [], []
+ for elem in values:
+ try:
+ numbers.append(it(float(elem)))
+ except (TypeError, ValueError):
+ categorical.append(str(elem))
+
+ return list(map(str, sorted(numbers))) + sorted(categorical)
+
+ m = check_hyperparams(models, "plot_parallel_coordinate")[0]
+ params = self._get_hyperparams(params, models)
+ met = self._get_metric(metric, max_one=True)
+
+ dims = _get_dims_from_info(
+ _get_parallel_coordinate_info(
+ study=m.study,
+ params=params,
+ target=None if len(self._metric) == 1 else lambda x: x.values[met],
+ target_name=self._metric[met].name,
+ )
+ )
+
+ # Clean up dimensions for nicer view
+ for d in [dims[0]] + sorted(dims[1:], key=lambda x: params.index(x["label"])):
+ if "ticktext" in d:
+ # Skip processing for logarithmic params
+ if all(isinstance(i, INT_TYPES) for i in d["values"]):
+ # Order categorical values
+ mapping = [d["ticktext"][i] for i in d["values"]]
+ d["ticktext"] = sort_mixed_types(d["ticktext"])
+ d["values"] = [d["ticktext"].index(v) for v in mapping]
+ else:
+ # Round numerical values
+ d["tickvals"] = list(
+ map(rnd, np.linspace(min(d["values"]), max(d["values"]), 5))
+ )
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ coloraxis=dict(
+ colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"),
+ cmin=min(dims[0]["values"]),
+ cmax=max(dims[0]["values"]),
+ title=self._metric[met].name,
+ font_size=self.label_fontsize,
+ )
+ )
+
+ fig.add_trace(
+ go.Parcoords(
+ dimensions=dims,
+ line=dict(
+ color=dims[0]["values"],
+ coloraxis=f"coloraxis{xaxis[1:]}",
+ ),
+ unselected=dict(line=dict(color="gray", opacity=0.5)),
+ labelside="bottom",
+ labelfont=dict(size=self.label_fontsize),
+ )
+ )
+
+ BasePlot._fig.used_models.append(m)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ title=title,
+ legend=legend,
+ figsize=figsize or (700 + len(params) * 50, 600),
+ plotname="plot_parallel_coordinate",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_pareto_front(
+ self,
+ models: INT | str | MODEL | None = None,
+ metric: str | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the Pareto front of a study.
+
+ Shows the trial scores plotted against each other. The marker's
+ colors indicate the trial number. This plot is only available
+ for models that ran [multi-metric runs][] with
+ [hyperparameter tuning][].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_pareto_front()`.
+
+ metric: str, sequence or None, default=None
+ Metrics to plot. Use a sequence or add `+` between options
+ to select more than one. If None, the metrics used to run
+ the pipeline are selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of metrics shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_edf
+ atom.plots:HyperparameterTuningPlot.plot_slice
+ atom.plots:HyperparameterTuningPlot.plot_trials
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(
+ models="RF",
+ metric=["f1", "accuracy", "recall"],
+ n_trials=15,
+ )
+ atom.plot_pareto_front()
+ ```
+
+ """
+ m = check_hyperparams(models, "plot_pareto_front")[0]
+
+ if len(metric := self._get_metric(metric, max_one=False)) < 2:
+ raise ValueError(
+ "Invalid value for the metric parameter. A minimum "
+ f"of two metrics are required, got {len(metric)}."
+ )
+
+ fig = self._get_figure()
+ for i in range((length := len(metric) - 1) ** 2):
+ x, y = i // length, i % length
+
+ if y <= x:
+ # Calculate the distance between subplots
+ offset = divide(0.0125, length - 1)
+
+ # Calculate the size of the subplot
+ size = (1 - ((offset * 2) * (length - 1))) / length
+
+ # Determine the position for the axes
+ x_pos = y * (size + 2 * offset)
+ y_pos = (length - x - 1) * (size + 2 * offset)
+
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(x_pos, rnd(x_pos + size)),
+ y=(y_pos, rnd(y_pos + size)),
+ )
+
+ fig.add_trace(
+ go.Scatter(
+ x=m.trials.apply(lambda row: row["score"][y], axis=1),
+ y=m.trials.apply(lambda row: row["score"][x + 1], axis=1),
+ mode="markers",
+ marker=dict(
+ size=self.marker_size,
+ color=m.trials.index,
+ colorscale="Teal",
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ ),
+ customdata=m.trials.index,
+ hovertemplate="(%{x}, %{y})Trial %{customdata}",
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ if x < len(metric) - 1:
+ fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
+ if y > 0:
+ fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
+
+ self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel=self._metric[y].name if x == length - 1 else None,
+ ylabel=self._metric[x + 1].name if y == 0 else None,
+ )
+
+ BasePlot._fig.used_models.append(m)
+ return self._plot(
+ title=title,
+ legend=legend,
+ figsize=figsize or (500 + 100 * length, 500 + 100 * length),
+ plotname="plot_pareto_front",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_slice(
+ self,
+ models: INT | str | MODEL | None = None,
+ params: str | slice | SEQUENCE | None = None,
+ metric: INT | str | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the parameter relationship in a study.
+
+ The color of the markers indicate the trial. This plot is only
+ available for models that ran [hyperparameter tuning][].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_slice()`.
+
+ params: str, slice, sequence or None, default=None
+ Hyperparameters to plot. Use a sequence or add `+` between
+ options to select more than one. If None, all the model's
+ hyperparameters are selected.
+
+ metric: int or str, default=None
+ Metric to plot (only for multi-metric runs). If str, add `+`
+ between options to select more than one. If None, the metric
+ used to run the pipeline is selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of hyperparameters shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_edf
+ atom.plots:HyperparameterTuningPlot.plot_hyperparameters
+ atom.plots:HyperparameterTuningPlot.plot_parallel_coordinate
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(
+ models="RF",
+ metric=["f1", "recall"],
+ n_trials=15,
+ )
+ atom.plot_slice(params=(0, 1, 2))
+ ```
+
+ """
+ m = check_hyperparams(models, "plot_slice")[0]
+ params = self._get_hyperparams(params, models)
+ metric = self._get_metric(metric, max_one=False)
+
+ fig = self._get_figure()
+ for i in range(len(params) * len(metric)):
+ x, y = i // len(params), i % len(params)
+
+ # Calculate the distance between subplots
+ x_offset = divide(0.0125, (len(params) - 1))
+ y_offset = divide(0.0125, (len(metric) - 1))
+
+ # Calculate the size of the subplot
+ x_size = (1 - ((x_offset * 2) * (len(params) - 1))) / len(params)
+ y_size = (1 - ((y_offset * 2) * (len(metric) - 1))) / len(metric)
+
+ # Determine the position for the axes
+ x_pos = y * (x_size + 2 * x_offset)
+ y_pos = (len(metric) - x - 1) * (y_size + 2 * y_offset)
+
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(x_pos, rnd(x_pos + x_size)),
+ y=(y_pos, rnd(y_pos + y_size)),
+ )
+
+ fig.add_trace(
+ go.Scatter(
+ x=m.trials.apply(lambda r: r["params"].get(params[y], None), axis=1),
+ y=m.trials.apply(lambda r: lst(r["score"])[x], axis=1),
+ mode="markers",
+ marker=dict(
+ size=self.marker_size,
+ color=m.trials.index,
+ colorscale="Teal",
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ ),
+ customdata=m.trials.index,
+ hovertemplate="(%{x}, %{y})Trial %{customdata}",
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ if _is_log_scale(m.study.trials, params[y]):
+ fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"})
+
+ if x < len(metric) - 1:
+ fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False})
+ if y > 0:
+ fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False})
+
+ self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel=params[y] if x == len(metric) - 1 else None,
+ ylabel=self._metric[x].name if y == 0 else None,
+ )
+
+ BasePlot._fig.used_models.append(m)
+ return self._plot(
+ title=title,
+ legend=legend,
+ figsize=figsize or (800 + 100 * len(params), 500 + 100 * len(metric)),
+ plotname="plot_slice",
+ filename=filename,
+ display=display,
+ )
+
+    @composed(crash, plot_from_model)
+    def plot_terminator_improvement(
+        self,
+        models: INT | str | MODEL | slice | SEQUENCE | None = None,
+        *,
+        title: str | dict | None = None,
+        legend: str | dict | None = "upper right",
+        figsize: tuple[INT, INT] = (900, 600),
+        filename: str | None = None,
+        display: bool | None = True,
+    ) -> go.Figure | None:
+        """Plot the potentials for future objective improvement.
+
+        This function visualizes the objective improvement potentials.
+        It helps to determine whether you should continue the
+        optimization or not. The evaluated error is also plotted. Note
+        that this function may take some time to compute the improvement
+        potentials. This plot is only available for models that ran
+        [hyperparameter tuning][].
+
+        !!! warning
+            * The plot_terminator_improvement method is only available
+              for models that ran [hyperparameter tuning][] using
+              cross-validation, e.g. using `ht_params={'cv': 5}`.
+            * This method can be slow. Results are cached to fasten
+              repeated calls.
+
+        Parameters
+        ----------
+        models: int, str, Model, slice, sequence or None, default=None
+            Models to plot. If None, all models that used hyperparameter
+            tuning are selected.
+
+        title: str, dict or None, default=None
+            Title for the plot.
+
+            - If None, no title is shown.
+            - If str, text for the title.
+            - If dict, [title configuration][parameters].
+
+        legend: str, dict or None, default="upper right",
+            Legend for the plot. See the [user guide][parameters] for
+            an extended description of the choices.
+
+            - If None: No legend is shown.
+            - If str: Location where to show the legend.
+            - If dict: Legend configuration.
+
+        figsize: tuple, default=(900, 600)
+            Figure's size in pixels, format as (x, y)
+
+        filename: str or None, default=None
+            Save the plot using this name. Use "auto" for automatic
+            naming. The type of the file depends on the provided name
+            (.html, .png, .pdf, etc...). If `filename` has no file type,
+            the plot is saved as html. If None, the plot is not saved.
+
+        display: bool or None, default=True
+            Whether to render the plot. If None, it returns the figure.
+
+        Returns
+        -------
+        [go.Figure][] or None
+            Plot object. Only returned if `display=None`.
+
+        See Also
+        --------
+        atom.plots:HyperparameterTuningPlot.plot_pareto_front
+        atom.plots:HyperparameterTuningPlot.plot_timeline
+        atom.plots:HyperparameterTuningPlot.plot_trials
+
+        Examples
+        --------
+        ```pycon
+        from atom import ATOMClassifier
+        from sklearn.datasets import make_classification
+
+        X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)
+
+        atom = ATOMClassifier(X, y, random_state=1)
+        atom.run("RF", n_trials=10, ht_params={"cv": 5})
+        atom.plot_terminator_improvement()
+        ```
+
+        """
+        # The improvement estimation requires the optional botorch package
+        check_dependency("botorch")
+
+        models = check_hyperparams(models, "plot_terminator_improvement")
+
+        fig = self._get_figure()
+        xaxis, yaxis = BasePlot._fig.get_axes()
+        for m in models:
+            if m._ht["cv"] > 1:
+                # Computing the improvement info is expensive, so the call is
+                # routed through the instance's memory cache to speed up
+                # repeated invocations with the same study
+                info = self._memory.cache(_get_improvement_info)(m.study, get_error=True)
+            else:
+                # Fail fast: without cross-validation there is no error
+                # estimate, so the improvement potential can't be computed.
+                # Note that this aborts the whole plot, even if other selected
+                # models did use cross-validation.
+                raise ValueError(
+                    "The plot_terminator_improvement method is only available for "
+                    "models that ran hyperparameter tuning using cross-validation, "
+                    "e.g. using ht_params={'cv': 5}."
+                )
+
+            fig.add_trace(
+                self._draw_line(
+                    x=m.trials.index,
+                    y=info.improvements,
+                    error_y=dict(type="data", array=info.errors),
+                    mode="markers+lines",
+                    parent=m.name,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
+                )
+            )
+
+        BasePlot._fig.used_models.extend(models)
+        return self._plot(
+            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+            xlabel="Trial",
+            ylabel="Terminator improvement",
+            title=title,
+            legend=legend,
+            figsize=figsize,
+            plotname="plot_terminator_improvement",
+            filename=filename,
+            display=display,
+        )
+
+ @composed(crash, plot_from_model)
+ def plot_timeline(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the timeline of a study.
+
+ This plot is only available for models that ran
+ [hyperparameter tuning][].
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models that used hyperparameter
+ tuning are selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right",
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y)
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_edf
+ atom.plots:HyperparameterTuningPlot.plot_slice
+ atom.plots:HyperparameterTuningPlot.plot_terminator_improvement
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from optuna.pruners import PatientPruner
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(
+ models="LGB",
+ n_trials=15,
+ ht_params={"pruner": PatientPruner(None, patience=2)},
+ )
+ atom.plot_timeline()
+ ```
+
+ """
+ models = check_hyperparams(models, "plot_timeline")
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ _cm = {
+ "COMPLETE": BasePlot._fig._palette[0], # Main color
+ "FAIL": "rgb(255, 0, 0)", # Red
+ "PRUNED": "rgb(255, 165, 0)", # Orange
+ "RUNNING": "rgb(124, 252, 0)", # Green
+ "WAITING": "rgb(220, 220, 220)", # Gray
+ }
+
+ for m in models:
+ info = []
+ for trial in m.study.get_trials(deepcopy=False):
+ date_complete = trial.datetime_complete or datetime.now()
+ date_start = trial.datetime_start or date_complete
+
+ # Create nice representation of scores and params for hover
+ s = [f'{m}: {trial.values[i]}' for i, m in enumerate(self._metric.keys())]
+ p = [f" --> {k}: {v}" for k, v in trial.params.items()]
+
+ info.append(
+ Bunch(
+ number=trial.number,
+ start=date_start,
+ duration=1000 * (date_complete - date_start).total_seconds(),
+ state=trial.state,
+ hovertext=(
+ f"Trial: {trial.number}
"
+ f"{'
'.join(s)}"
+ f"Parameters:
{'
'.join(p)}"
+ )
+ )
+ )
+
+ for state in sorted(TrialState, key=lambda x: x.name):
+ if bars := list(filter(lambda x: x.state == state, info)):
+ fig.add_trace(
+ go.Bar(
+ name=state.name,
+ x=[b.duration for b in bars],
+ y=[b.number for b in bars],
+ base=[b.start.isoformat() for b in bars],
+ text=[b.hovertext for b in bars],
+ textposition="none",
+ hovertemplate=f"%{{text}}{m.name}",
+ orientation="h",
+ marker=dict(
+ color=f"rgba({_cm[state.name][4:-1]}, 0.2)",
+ line=dict(width=2, color=_cm[state.name]),
+ ),
+ showlegend=BasePlot._fig.showlegend(_cm[state.name], legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout({f"xaxis{yaxis[1:]}_type": "date", "barmode": "group"})
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Datetime",
+ ylabel="Trial",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_timeline",
+ filename=filename,
+ display=display,
+ )
+
+    @composed(crash, plot_from_model)
+    def plot_trials(
+        self,
+        models: INT | str | MODEL | slice | SEQUENCE | None = None,
+        metric: INT | str | SEQUENCE | None = None,
+        *,
+        title: str | dict | None = None,
+        legend: str | dict | None = "upper left",
+        figsize: tuple[INT, INT] = (900, 800),
+        filename: str | None = None,
+        display: bool | None = True,
+    ) -> go.Figure | None:
+        """Plot the hyperparameter tuning trials.
+
+        Creates a figure with two plots: the first plot shows the score
+        of every trial and the second shows the distance between the
+        last consecutive steps. The best trial is indicated with a star.
+        This is the same plot as produced by `ht_params={"plot": True}`.
+        This plot is only available for models that ran
+        [hyperparameter tuning][].
+
+        Parameters
+        ----------
+        models: int, str, Model, slice, sequence or None, default=None
+            Models to plot. If None, all models that used hyperparameter
+            tuning are selected.
+
+        metric: int, str, sequence or None, default=None
+            Metric to plot (only for multi-metric runs). Add `+` between
+            options to select more than one. If None, all metrics are
+            selected.
+
+        title: str, dict or None, default=None
+            Title for the plot.
+
+            - If None, no title is shown.
+            - If str, text for the title.
+            - If dict, [title configuration][parameters].
+
+        legend: str, dict or None, default="upper left"
+            Legend for the plot. See the [user guide][parameters] for
+            an extended description of the choices.
+
+            - If None: No legend is shown.
+            - If str: Location where to show the legend.
+            - If dict: Legend configuration.
+
+        figsize: tuple, default=(900, 800)
+            Figure's size in pixels, format as (x, y).
+
+        filename: str or None, default=None
+            Save the plot using this name. Use "auto" for automatic
+            naming. The type of the file depends on the provided name
+            (.html, .png, .pdf, etc...). If `filename` has no file type,
+            the plot is saved as html. If None, the plot is not saved.
+
+        display: bool or None, default=True
+            Whether to render the plot. If None, it returns the figure.
+
+        Returns
+        -------
+        [go.Figure][] or None
+            Plot object. Only returned if `display=None`.
+
+        See Also
+        --------
+        atom.plots:PredictionPlot.plot_evals
+        atom.plots:HyperparameterTuningPlot.plot_hyperparameters
+        atom.plots:PredictionPlot.plot_results
+
+        Examples
+        --------
+        ```pycon
+        from atom import ATOMClassifier
+        from sklearn.datasets import make_classification
+
+        X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)
+
+        atom = ATOMClassifier(X, y, random_state=1)
+        atom.run(["ET", "RF"], n_trials=15)
+        atom.plot_trials()
+        ```
+
+        """
+        models = check_hyperparams(models, "plot_trials")
+        metric = self._get_metric(metric, max_one=False)
+
+        fig = self._get_figure()
+        # Two stacked subplots: scores on top, step distances at the bottom
+        xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0))
+        xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29))
+        for m in models:
+            for met in metric:
+                y = m.trials["score"].apply(lambda value: lst(value)[met])
+
+                # Create star symbol at best trial
+                symbols = ["circle"] * len(y)
+                symbols[m.best_trial.number] = "star"
+                sizes = [self.marker_size] * len(y)
+                sizes[m.best_trial.number] = self.marker_size * 1.5
+
+                fig.add_trace(
+                    self._draw_line(
+                        x=list(range(len(y))),
+                        y=y,
+                        mode="lines+markers",
+                        marker_symbol=symbols,
+                        marker_size=sizes,
+                        hovertemplate=None,
+                        parent=m.name,
+                        child=self._metric[met].name,
+                        legend=legend,
+                        # Both traces are drawn on the bottom x-axis; the top
+                        # y-axis is re-anchored to it in the layout update below
+                        # so the two subplots share one x-axis
+                        xaxis=xaxis2,
+                        yaxis=yaxis,
+                    )
+                )
+
+                fig.add_trace(
+                    self._draw_line(
+                        # Distances start at trial 1 (diff consumes one element)
+                        x=list(range(1, len(y))),
+                        y=np.abs(np.diff(y)),
+                        mode="lines+markers",
+                        marker_symbol="circle",
+                        parent=m.name,
+                        child=self._metric[met].name,
+                        legend=legend,
+                        xaxis=xaxis2,
+                        yaxis=yaxis2,
+                    )
+                )
+
+        fig.update_layout(
+            {
+                # Anchor the top y-axis to the shared (bottom) x-axis and hide
+                # the unused top x-axis labels
+                f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}",
+                f"xaxis{xaxis[1:]}_showticklabels": False,
+                "hovermode": "x unified",
+            },
+        )
+
+        self._plot(
+            ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
+            xlabel="Trial",
+            ylabel="d",
+        )
+
+        BasePlot._fig.used_models.extend(models)
+        return self._plot(
+            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+            groupclick="togglegroup",
+            ylabel="Score",
+            title=title,
+            legend=legend,
+            figsize=figsize,
+            plotname="plot_trials",
+            filename=filename,
+            display=display,
+        )
diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py
new file mode 100644
index 000000000..22ef8a691
--- /dev/null
+++ b/atom/plots/predictionplot.py
@@ -0,0 +1,3546 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing the PredictionPlot class.
+
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from functools import reduce
+from itertools import chain
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from joblib import Parallel, delayed
+from plotly.colors import unconvert_from_RGB_255, unlabel_rgb
+from scipy import stats
+from scipy.stats.mstats import mquantiles
+from sklearn.calibration import calibration_curve
+from sklearn.inspection import partial_dependence, permutation_importance
+from sklearn.metrics import (
+ confusion_matrix, det_curve, precision_recall_curve, roc_curve,
+)
+from sklearn.utils import _safe_indexing
+from sklearn.utils.metaestimators import available_if
+from sktime.forecasting.base import ForecastingHorizon
+from typeguard import typechecked
+
+from atom.plots.base import BasePlot
+from atom.utils.constants import PALETTE
+from atom.utils.types import (
+ FEATURES, FLOAT, INT, LEGEND, METRIC_SELECTOR, MODEL, SCALAR, SEQUENCE,
+ SLICE,
+)
+from atom.utils.utils import (
+ bk, check_canvas, check_dependency, check_predict_proba, composed, crash,
+ divide, get_best_score, get_custom_scorer, has_task, is_binary,
+ is_multioutput, lst, plot_from_model, rnd,
+)
+
+
+@typechecked
+class PredictionPlot(BasePlot):
+ """Prediction plots.
+
+ Plots that use the model's predictions. These plots are accessible
+ from the runners or from the models. If called from a runner, the
+ `models` parameter has to be specified (if None, uses all models).
+ If called from a model, that model is used and the `models` parameter
+ becomes unavailable.
+
+ """
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_calibration(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ n_bins: INT = 10,
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = "upper left",
+ figsize: tuple[INT, INT] = (900, 900),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the calibration curve for a binary classifier.
+
+ Well calibrated classifiers are probabilistic classifiers for
+ which the output of the `predict_proba` method can be directly
+ interpreted as a confidence level. For instance a well
+ calibrated (binary) classifier should classify the samples such
+ that among the samples to which it gave a `predict_proba` value
+ close to 0.8, approx. 80% actually belong to the positive class.
+ Read more in sklearn's [documentation][calibration].
+
+ This figure shows two plots: the calibration curve, where the
+ x-axis represents the average predicted probability in each bin
+ and the y-axis is the fraction of positives, i.e. the proportion
+ of samples whose class is the positive class (in each bin); and
+ a distribution of all predicted probabilities of the classifier.
+ This plot is available only for models with a `predict_proba`
+ method in a binary or [multilabel][] classification task.
+
+ !!! tip
+ Use the [calibrate][adaboost-calibrate] method to calibrate
+ the winning model.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the metric. Use a sequence
+ or add `+` between options to select more than one. Choose
+ from: "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ n_bins: int, default=10
+ Number of bins used for calibration. Minimum of 5 required.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper left"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 900)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_lift
+ atom.plots:PredictionPlot.plot_prc
+ atom.plots:PredictionPlot.plot_roc
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["RF", "LGB"])
+ atom.plot_calibration()
+ ```
+
+ """
+ check_predict_proba(models, "plot_calibration")
+ dataset = self._get_set(dataset, max_one=False)
+ target = self.branch._get_target(target, only_columns=True)
+
+ if n_bins < 5:
+ raise ValueError(
+ "Invalid value for the n_bins parameter."
+ f"Value should be >=5, got {n_bins}."
+ )
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0))
+ xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29))
+ for m in models:
+ for ds in dataset:
+ y_true, y_pred = m._get_pred(ds, target, attr="predict_proba")
+
+ # Get calibration (frac of positives and predicted values)
+ frac_pos, pred = calibration_curve(y_true, y_pred, n_bins=n_bins)
+
+ fig.add_trace(
+ self._draw_line(
+ x=pred,
+ y=frac_pos,
+ parent=m.name,
+ child=ds,
+ mode="lines+markers",
+ marker_symbol="circle",
+ legend=legend,
+ xaxis=xaxis2,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.add_trace(
+ go.Histogram(
+ x=y_pred,
+ xbins=dict(start=0, end=1, size=1. / n_bins),
+ marker=dict(
+ color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
+ ),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=False,
+ xaxis=xaxis2,
+ yaxis=yaxis2,
+ )
+ )
+
+ self._draw_straight_line(y="diagonal", xaxis=xaxis2, yaxis=yaxis)
+
+ fig.update_layout(
+ {
+ f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}",
+ f"xaxis{xaxis2[1:]}_showgrid": True,
+ "barmode": "overlay",
+ }
+ )
+
+ self._plot(
+ ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
+ xlabel="Predicted value",
+ ylabel="Count",
+ xlim=(0, 1),
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="togglegroup",
+ ylabel="Fraction of positives",
+ ylim=(-0.05, 1.05),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_calibration",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task("class"))
+ @composed(crash, plot_from_model)
+ def plot_confusion_matrix(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str = "test",
+ target: INT | str = 0,
+ threshold: FLOAT = 0.5,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper right",
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot a model's confusion matrix.
+
+ For one model, the plot shows a heatmap. For multiple models,
+ it compares TP, FP, FN and TN in a barplot (not implemented
+ for multiclass classification tasks). This plot is available
+ only for classification tasks.
+
+ !!! tip
+ Fill the `threshold` parameter with the result from the
+ model's `get_best_threshold` method to optimize the results.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str, default="test"
+ Data set on which to calculate the confusion matrix. Choose
+ from:` "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multioutput tasks][].
+
+ threshold: float, default=0.5
+ Threshold between 0 and 1 to convert predicted probabilities
+ to class labels. Only for binary classification tasks.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the plot's type.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_calibration
+ atom.plots:PredictionPlot.plot_threshold
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, test_size=0.4)
+ atom.run(["LR", "RF"])
+ atom.lr.plot_confusion_matrix() # For one model
+ atom.plot_confusion_matrix() # For multiple models
+ ```
+
+ """
+ ds = self._get_set(dataset, max_one=True)
+ target = self.branch._get_target(target, only_columns=True)
+
+ if self.task.startswith("multiclass") and len(models) > 1:
+ raise NotImplementedError(
+ "The plot_confusion_matrix method does not support "
+ "the comparison of multiple models for multiclass "
+ "or multiclass-multioutput classification tasks."
+ )
+
+ labels = np.array(
+ (("True negatives", "False positives"), ("False negatives", "True positives"))
+ )
+
+ fig = self._get_figure()
+ if len(models) == 1:
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(0, 0.87),
+ coloraxis=dict(
+ colorscale="Blues",
+ cmin=0,
+ cmax=100,
+ title="Percentage of samples",
+ font_size=self.label_fontsize,
+ ),
+ )
+ else:
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ for m in models:
+ y_true, y_pred = m._get_pred(ds, target, attr="predict")
+ if threshold != 0.5:
+ y_pred = (y_pred > threshold).astype("int")
+
+ cm = confusion_matrix(y_true, y_pred)
+ if len(models) == 1: # Create matrix heatmap
+ ticks = m.mapping.get(target, np.unique(m.dataset[target]).astype(str))
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(0, 0.87),
+ coloraxis=dict(
+ colorscale="Blues",
+ cmin=0,
+ cmax=100,
+ title="Percentage of samples",
+ font_size=self.label_fontsize,
+ ),
+ )
+
+ fig.add_trace(
+ go.Heatmap(
+ x=ticks,
+ y=ticks,
+ z=100. * cm / cm.sum(axis=1)[:, np.newaxis],
+ coloraxis=f"coloraxis{xaxis[1:]}",
+ text=cm,
+ customdata=labels,
+ texttemplate="%{text}
(%{z:.2f}%)",
+ textfont=dict(size=self.label_fontsize),
+ hovertemplate=(
+ "%{customdata}
" if is_binary(self.task) else ""
+ "x:%{x}
y:%{y}
z:%{z}"
+ ),
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ "template": "plotly_white",
+ f"yaxis{yaxis[1:]}_autorange": "reversed",
+ f"xaxis{xaxis[1:]}_showgrid": False,
+ f"yaxis{yaxis[1:]}_showgrid": False,
+ }
+ )
+
+ else:
+ color = BasePlot._fig.get_elem(m.name)
+ fig.add_trace(
+ go.Bar(
+ x=cm.ravel(),
+ y=labels.ravel(),
+ orientation="h",
+ marker=dict(
+ color=f"rgba({color[4:-1]}, 0.2)",
+ line=dict(width=2, color=color),
+ ),
+ hovertemplate="%{x}",
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(bargroupgap=0.05)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Predicted label" if len(models) == 1 else "Count",
+ ylabel="True label" if len(models) == 1 else None,
+ title=title,
+ legend=legend,
+ figsize=figsize or ((800, 800) if len(models) == 1 else (900, 600)),
+ plotname="plot_confusion_matrix",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_det(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ):
+ """Plot the Detection Error Tradeoff curve.
+
+ Read more about [DET][] in sklearn's documentation. Only
+ available for binary classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the metric. Use a sequence
+ or add `+` between options to select more than one. Choose
+ from: "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_gains
+ atom.plots:PredictionPlot.plot_roc
+ atom.plots:PredictionPlot.plot_prc
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_det()
+ ```
+
+ """
+ dataset = self._get_set(dataset, max_one=False)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ for ds in dataset:
+ # Get fpr-fnr pairs for different thresholds
+ fpr, fnr, _ = det_curve(*m._get_pred(ds, target, attr="thresh"))
+
+ fig.add_trace(
+ self._draw_line(
+ x=fpr,
+ y=fnr,
+ mode="lines",
+ parent=m.name,
+ child=ds,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="FPR",
+ ylabel="FNR",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_det",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task("reg"))
+ @composed(crash, plot_from_model)
+ def plot_errors(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot a model's prediction errors.
+
+ Plot the actual targets from a set against the predicted values
+ generated by the regressor. A linear fit is made on the data.
+ The gray, intersected line shows the identity line. This plot
+ can be useful to detect noise or heteroscedasticity along a
+ range of the target domain. This plot is available only for
+ regression tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str, default="test"
+ Data set on which to calculate the metric. Choose from:
+ "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multioutput tasks][].
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_residuals
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMRegressor
+ from sklearn.datasets import load_diabetes
+
+ X, y = load_diabetes(return_X_y=True, as_frame=True)
+
+ atom = ATOMRegressor(X, y)
+ atom.run(["OLS", "LGB"])
+ atom.plot_errors()
+ ```
+
+ """
+ ds = self._get_set(dataset, max_one=True)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ y_true, y_pred = m._get_pred(ds, target)
+
+ fig.add_trace(
+ go.Scatter(
+ x=y_true,
+ y=y_pred,
+ mode="markers",
+ line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ # Fit the points using linear regression
+ from atom.models import OrdinaryLeastSquares
+ model = OrdinaryLeastSquares(goal=self.goal, branch=m.branch)._get_est()
+ model.fit(y_true.values.reshape(-1, 1), y_pred)
+
+ fig.add_trace(
+ go.Scatter(
+ x=(x := np.linspace(y_true.min(), y_true.max(), 100)),
+ y=model.predict(x[:, np.newaxis]),
+ mode="lines",
+ line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
+ hovertemplate="(%{x}, %{y})",
+ legendgroup=m.name,
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="togglegroup",
+ xlabel="True value",
+ title=title,
+ legend=legend,
+ ylabel="Predicted value",
+ figsize=figsize,
+ plotname="plot_errors",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(ensembles=False))
+ def plot_evals(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot evaluation curves.
+
+ The evaluation curves are the main metric scores achieved by the
+ models at every iteration of the training process. This plot is
+ available only for models that allow [in-training validation][].
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the evaluation curves. Use a
+ sequence or add `+` between options to select more than one.
+ Choose from: "train" or "test".
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:HyperparameterTuningPlot.plot_trials
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["XGB", "LGB"])
+ atom.plot_evals()
+ ```
+
+ """
+ dataset = self._get_set(dataset, max_one=False, allow_holdout=False)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ if not m.evals:
+ raise ValueError(
+ "Invalid value for the models parameter. Model "
+ f"{m.name} has no in-training validation."
+ )
+
+ for ds in dataset:
+ fig.add_trace(
+ self._draw_line(
+ x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))),
+ y=m.evals[f"{self._metric[0].name}_{ds}"],
+ marker_symbol="circle",
+ parent=m.name,
+ child=ds,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Iterations",
+ ylabel=self._metric[0].name,
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_evals",
+ filename=filename,
+ display=display,
+ )
+
    @composed(crash, plot_from_model)
    def plot_feature_importance(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        show: INT | None = None,
        *,
        title: str | dict | None = None,
        legend: str | dict | None = "lower right",
        figsize: tuple[INT, INT] | None = None,
        filename: str | None = None,
        display: bool | None = True,
    ) -> go.Figure | None:
        """Plot a model's feature importance.

        The sum of importances for all features (per model) is 1.
        This plot is available only for models whose estimator has
        a `scores_`, `feature_importances_` or `coef_` attribute.

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models to plot. If None, all models are selected.

        show: int or None, default=None
            Number of features (ordered by importance) to show. If
            None, it shows all features.

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="lower right"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple or None, default=None
            Figure's size in pixels, format as (x, y). If None, it
            adapts the size to the number of features shown.

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [go.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:PredictionPlot.plot_parshap
        atom.plots:PredictionPlot.plot_partial_dependence
        atom.plots:PredictionPlot.plot_permutation_importance

        Examples
        --------
        ```pycon
        from atom import ATOMClassifier
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True, as_frame=True)

        atom = ATOMClassifier(X, y, random_state=1)
        atom.run(["LR", "RF"])
        atom.plot_feature_importance(show=10)
        ```

        """
        # Resolve the number of features to display (validated vs models)
        show = self._get_show(show, models)

        fig = self._get_figure()
        xaxis, yaxis = BasePlot._fig.get_axes()
        for m in models:
            # feature_importance is None when the estimator exposes no
            # importance attribute; fail early with a clear message
            if (fi := m.feature_importance) is None:
                raise ValueError(
                    "Invalid value for the models parameter. The estimator "
                    f"{m.estimator.__class__.__name__} has no feature_importances_ "
                    "nor coef_ attribute."
                )

            # One horizontal bar per feature; fill is a translucent
            # version of the model's line color
            fig.add_trace(
                go.Bar(
                    x=fi,
                    y=fi.index,
                    orientation="h",
                    marker=dict(
                        color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
                        line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
                    ),
                    hovertemplate="%{x}",
                    name=m.name,
                    legendgroup=m.name,
                    showlegend=BasePlot._fig.showlegend(m.name, legend),
                    xaxis=xaxis,
                    yaxis=yaxis,
                )
            )

        # Order bars by total importance across models
        fig.update_layout(
            {
                f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
                "bargroupgap": 0.05,
            }
        )

        # Unique number of features over all branches
        n_fxs = len(set([fx for m in models for fx in m.features]))

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
            xlabel="Normalized feature importance",
            # ylim clips the category axis to the `show` most important features
            ylim=(n_fxs - show - 0.5, n_fxs - 0.5),
            title=title,
            legend=legend,
            figsize=figsize or (900, 400 + show * 50),
            plotname="plot_feature_importance",
            filename=filename,
            display=display,
        )
+
    @available_if(has_task("forecast"))
    @composed(crash, plot_from_model(check_fitted=False))
    def plot_forecast(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        fh: int | str | range | SEQUENCE | ForecastingHorizon = "test",
        X: FEATURES | None = None,
        target: INT | str = 0,
        plot_interval: bool = True,
        *,
        title: str | dict | None = None,
        legend: str | dict | None = "upper left",
        figsize: tuple[INT, INT] = (900, 600),
        filename: str | None = None,
        display: bool | None = True,
    ) -> go.Figure | None:
        """Plot a time series with model forecasts.

        This plot is only available for forecasting tasks.

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models to plot. If None, all models are selected. If no
            models are selected, only the target column is plotted.

        fh: int, str, range, sequence or [ForecastingHorizon][], default="test"
            Forecast horizon for which to plot the predictions. If
            string, choose from: "train", "test" or "holdout". Use a
            sequence or add `+` between options to select more than one.

        X: dataframe-like or None, default=None
            Exogenous time series corresponding to fh. This parameter
            is ignored if fh is a data set.

        target: int or str, default=0
            Target column to look at. Only for [multivariate][] tasks.

        plot_interval: bool, default=True
            Whether to plot prediction intervals instead of the exact
            prediction values. If True, the plotted estimators should
            have a `predict_interval` method.

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default="upper left"
            Legend for the plot. See the [user guide][parameters] for
            an extended description of the choices.

            - If None: No legend is shown.
            - If str: Location where to show the legend.
            - If dict: Legend configuration.

        figsize: tuple, default=(900, 600)
            Figure's size in pixels, format as (x, y).

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [go.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:PredictionPlot.plot_lift
        atom.plots:PredictionPlot.plot_prc
        atom.plots:PredictionPlot.plot_roc

        Examples
        --------
        ```pycon
        from atom import ATOMForecaster
        from sktime.datasets import load_airline

        y = load_airline()

        atom = ATOMForecaster(y, random_state=1)
        atom.plot_forecast()
        atom.run(
            models="arima",
            est_params={"order": (1, 1, 0), "seasonal_order": (0, 1, 0, 12)},
        )
        atom.plot_forecast()
        atom.plot_forecast(fh="train+test", plot_interval=False)

        # Forecast the next 4 years starting from the test set
        atom.plot_forecast(fh=range(1, 48))
        ```

        """
        target = self.branch._get_target(target, only_columns=True)

        fig = self._get_figure()
        xaxis, yaxis = BasePlot._fig.get_axes()

        # Draw original time series
        for ds in ("train", "test"):
            fig.add_trace(
                go.Scatter(
                    x=self._get_plot_index(getattr(self, ds)),
                    y=getattr(self, ds)[target],
                    mode="lines+markers",
                    line=dict(
                        width=2,
                        color="black",
                        dash=BasePlot._fig.get_elem(ds, "dash"),
                    ),
                    opacity=0.6,
                    name=ds,
                    # Only label the raw series when no model forecast is drawn
                    showlegend=False if models else BasePlot._fig.showlegend(ds, legend),
                    xaxis=xaxis,
                    yaxis=yaxis,
                )
            )

        # Draw predictions
        for m in models:
            if isinstance(fh, str):
                # Get fh and corresponding X from data set
                # NOTE(review): fh and X are overwritten here on the first
                # model and reused as-is for subsequent models — presumably
                # intentional since the resolved horizon is identical;
                # confirm if branches can differ per model
                datasets = self._get_set(fh, max_one=False)
                fh = bk.concat([getattr(m, ds) for ds in datasets]).index
                X = m.X.loc[fh]

            y_pred = m.predict(fh, X)
            if is_multioutput(self.task):
                # Keep only the requested target column for multivariate
                y_pred = y_pred[target]

            fig.add_trace(
                self._draw_line(
                    x=self._get_plot_index(y_pred),
                    y=y_pred,
                    mode="lines+markers",
                    parent=m.name,
                    legend=legend,
                    xaxis=xaxis,
                    yaxis=yaxis,
                )
            )

            if plot_interval:
                try:
                    y_pred = m.predict_interval(fh, X)
                except NotImplementedError:
                    continue  # Fails for some models like ES

                if is_multioutput(self.task):
                    # Select interval of target column for multivariate
                    y = y_pred.iloc[:, y_pred.columns.get_loc(target)]
                else:
                    y = y_pred  # Univariate

                # Upper bound first, then lower bound with fill="tonexty"
                # so the band is shaded between the two traces
                fig.add_traces(
                    [
                        go.Scatter(
                            x=self._get_plot_index(y_pred),
                            y=y.iloc[:, 1],
                            mode="lines",
                            line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
                            hovertemplate=f"%{{y}}{m.name} - upper bound",
                            legendgroup=m.name,
                            showlegend=False,
                            xaxis=xaxis,
                            yaxis=yaxis,
                        ),
                        go.Scatter(
                            x=self._get_plot_index(y_pred),
                            y=y.iloc[:, 0],
                            mode="lines",
                            line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
                            fill="tonexty",
                            # "rgb(r,g,b)"[3:-1] -> "(r,g,b"; closed by ", 0.2)"
                            fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
                            hovertemplate=f"%{{y}}{m.name} - lower bound",
                            legendgroup=m.name,
                            showlegend=False,
                            xaxis=xaxis,
                            yaxis=yaxis,
                        )
                    ]
                )

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
            groupclick="togglegroup" if plot_interval else "toggleitem",
            xlabel=self.y.index.name,
            ylabel=target,
            title=title,
            legend=legend,
            figsize=figsize,
            plotname="plot_forecast",
            filename=filename,
            display=display,
        )
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_gains(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the cumulative gains curve.
+
+ This plot is available only for binary and [multilabel][]
+ classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the metric. Use a sequence
+ or add `+` between options to select more than one. Choose
+ from: "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_det
+ atom.plots:PredictionPlot.plot_lift
+ atom.plots:PredictionPlot.plot_roc
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_gains()
+ ```
+
+ """
+ dataset = self._get_set(dataset, max_one=False)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ for ds in dataset:
+ y_true, y_pred = m._get_pred(ds, target, attr="thresh")
+
+ fig.add_trace(
+ self._draw_line(
+ x=np.arange(start=1, stop=len(y_true) + 1) / len(y_true),
+ y=np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum(),
+ mode="lines",
+ parent=m.name,
+ child=ds,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Fraction of sample",
+ ylabel="Gain",
+ xlim=(0, 1),
+ ylim=(0, 1.02),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_gains",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(ensembles=False))
+ def plot_learning_curve(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ metric: INT | str | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the learning curve: score vs number of training samples.
+
+ This plot is available only for models fitted using
+ [train sizing][]. [Ensembles][] are ignored.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ metric: int, str, sequence or None, default=None
+ Metric to plot (only for multi-metric runs). Use a sequence
+ or add `+` between options to select more than one. If None,
+ the metric used to run the pipeline is selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_results
+ atom.plots:PredictionPlot.plot_successive_halving
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.train_sizing(["LR", "RF"], n_bootstrap=5)
+ atom.plot_learning_curve()
+ ```
+
+ """
+ metric = self._get_metric(metric, max_one=False)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ for met in metric:
+ x, y, std = defaultdict(list), defaultdict(list), defaultdict(list)
+ for m in models:
+ x[m._group].append(m._train_idx)
+ y[m._group].append(get_best_score(m, met))
+ if m.bootstrap is not None:
+ std[m._group].append(m.bootstrap.iloc[:, met].std())
+
+ for group in x:
+ fig.add_trace(
+ self._draw_line(
+ x=x[group],
+ y=y[group],
+ mode="lines+markers",
+ marker_symbol="circle",
+ error_y=dict(type="data", array=std[group], visible=True),
+ parent=group,
+ child=self._metric[met].name,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ # Add error bands
+ if m.bootstrap is not None:
+ fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)"
+ fig.add_traces(
+ [
+ go.Scatter(
+ x=x[group],
+ y=np.add(y[group], std[group]),
+ mode="lines",
+ line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ hovertemplate="%{y}upper bound",
+ legendgroup=group,
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ ),
+ go.Scatter(
+ x=x[group],
+ y=np.subtract(y[group], std[group]),
+ mode="lines",
+ line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ fill="tonexty",
+ fillcolor=fillcolor,
+ hovertemplate="%{y}lower bound",
+ legendgroup=group,
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ ),
+ ]
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="togglegroup",
+ title=title,
+ legend=legend,
+ xlabel="Number of training samples",
+ ylabel="Score",
+ figsize=figsize,
+ plotname="plot_learning_curve",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_lift(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the lift curve.
+
+ Only available for binary classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the metric. Use a sequence
+ or add `+` between options to select more than one. Choose
+ from: "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_det
+ atom.plots:PredictionPlot.plot_gains
+ atom.plots:PredictionPlot.plot_prc
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_lift()
+ ```
+
+ """
+ dataset = self._get_set(dataset, max_one=False)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ for ds in dataset:
+ y_true, y_pred = m._get_pred(ds, target, attr="thresh")
+
+ gains = np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()
+ fig.add_trace(
+ self._draw_line(
+ x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
+ y=gains / x,
+ mode="lines",
+ parent=m.name,
+ child=ds,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y=1, xaxis=xaxis, yaxis=yaxis)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Fraction of sample",
+ ylabel="Lift",
+ xlim=(0, 1),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_lift",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model)
+ def plot_parshap(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ columns: SLICE | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper left",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the partial correlation of shap values.
+
+ Plots the train and test correlation between the shap value of
+ every feature with its target value, after removing the effect
+ of all other features (partial correlation). This plot is
+ useful to identify the features that are contributing most to
+ overfitting. Features that lie below the bisector (diagonal
+ line) performed worse on the test set than on the training set.
+ If the estimator has a `scores_`, `feature_importances_` or
+ `coef_` attribute, its normalized values are shown in a color
+ map.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ columns: int, str, slice, sequence or None, default=None
+ Features to plot. If None, it plots all features.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper left"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_feature_importance
+ atom.plots:PredictionPlot.plot_partial_dependence
+ atom.plots:PredictionPlot.plot_permutation_importance
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["GNB", "RF"])
+ atom.rf.plot_parshap(legend=None)
+ atom.plot_parshap(columns=slice(5, 10))
+ ```
+
+ """
+ target = self.branch._get_target(target)
+
+ fig = self._get_figure()
+
+ # Colorbar is only needed when a model has feature_importance
+ if all(m.feature_importance is None for m in models):
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ else:
+ xaxis, yaxis = BasePlot._fig.get_axes(
+ x=(0, 0.87),
+ coloraxis=dict(
+ colorscale="Reds",
+ title="Normalized feature importance",
+ font_size=self.label_fontsize,
+ )
+ )
+
+ for m in models:
+ parshap = {}
+ fxs = m.branch._get_columns(columns, include_target=False)
+
+ for ds in ("train", "test"):
+ # Calculating shap values is computationally expensive,
+ # therefore select a random subsample for large data sets
+ if len(data := getattr(m, ds)) > 500:
+ data = data.sample(500, random_state=self.random_state)
+
+ # Replace data with the calculated shap values
+ explanation = m._shap.get_explanation(data[m.features], target)
+ data[m.features] = explanation.values
+
+ parshap[ds] = pd.Series(index=fxs, dtype=float)
+ for fx in fxs:
+ # All other features are covariates
+ covariates = [f for f in data.columns[:-1] if f != fx]
+ cols = [fx, data.columns[-1], *covariates]
+
+ # Compute covariance
+ V = data[cols].cov()
+
+ # Inverse covariance matrix
+ Vi = np.linalg.pinv(V, hermitian=True)
+ diag = Vi.diagonal()
+
+ D = np.diag(np.sqrt(1 / diag))
+
+ # Partial correlation matrix
+ partial_corr = -1 * (D @ Vi @ D) # @ is matrix multiplication
+
+ # Semi-partial correlation matrix
+ with np.errstate(divide="ignore"):
+ V_sqrt = np.sqrt(np.diag(V))[..., None]
+ Vi_sqrt = np.sqrt(np.abs(diag - Vi ** 2 / diag[..., None])).T
+ semi_partial_correlation = partial_corr / V_sqrt / Vi_sqrt
+
+ # X covariates are removed
+ parshap[ds][fx] = semi_partial_correlation[1, 0]
+
+ # Get the feature importance or coefficients
+ if m.feature_importance is not None:
+ color = m.feature_importance.loc[fxs]
+ else:
+ color = BasePlot._fig.get_elem("parshap")
+
+ fig.add_trace(
+ go.Scatter(
+ x=parshap["train"],
+ y=parshap["test"],
+ mode="markers+text",
+ marker=dict(
+ color=color,
+ size=self.marker_size,
+ coloraxis=f"coloraxis{xaxis[1:]}",
+ line=dict(width=1, color="rgba(255, 255, 255, 0.9)"),
+ ),
+ text=m.features,
+ textposition="top center",
+ customdata=(data := None if isinstance(color, str) else list(color)),
+ hovertemplate=(
+ f"%{{text}}
(%{{x}}, %{{y}})"
+ f"{'
Feature importance: %{customdata:.4f}' if data else ''}"
+ f"{m.name}"
+ ),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Training set",
+ ylabel="Test set",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_parshap",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model)
+ def plot_partial_dependence(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ columns: SLICE | None = None,
+ kind: str | SEQUENCE = "average",
+ pair: int | str | None = None,
+ target: INT | str = 1,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the partial dependence of features.
+
+ The partial dependence of a feature (or a set of features)
+ corresponds to the response of the model for each possible
+ value of the feature. The plot can take two forms:
+
+ - If `pair` is None: Single feature partial dependence lines.
+ The deciles of the feature values are shown with tick marks
+ on the bottom.
+ - If `pair` is defined: Two-way partial dependence plots are
+ plotted as contour plots (only allowed for a single model).
+
+ Read more about partial dependence on sklearn's
+ [documentation][partial_dependence]. This plot is not available
+ for multilabel nor multiclass-multioutput classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ columns: int, str, slice, sequence or None, default=None
+ Features to get the partial dependence from. If None, it
+ uses the first 3 features in the dataset.
+
+ kind: str or sequence, default="average"
+ Kind of depedence to plot. Use a sequence or add `+` between
+ options to select more than one. Choose from:
+
+ - "average": Partial dependence averaged across all samples
+ in the dataset.
+ - "individual": Partial dependence for up to 50 random
+ samples (Individual Conditional Expectation).
+
+ This parameter is ignored when plotting feature pairs.
+
+ pair: int, str or None, default=None
+ Feature with which to pair the features selected by
+ `columns`. If specified, the resulting figure displays
+ contour plots. Only allowed when plotting a single model.
+ If None, the plots show the partial dependece of single
+ features.
+
+ target: int or str, default=1
+ Class in the target column to look at (only for multiclass
+ classification tasks).
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_feature_importance
+ atom.plots:PredictionPlot.plot_parshap
+ atom.plots:PredictionPlot.plot_permutation_importance
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_partial_dependence(kind="average+individual", legend="upper left")
+ atom.rf.plot_partial_dependence(columns=(3, 4), pair=2)
+ ```
+
+ """
+ if any(self.task.startswith(t) for t in ("multilabel", "multiclass-multioutput")):
+ raise PermissionError(
+ "The plot_partial_dependence method is not available for multilabel "
+ f"nor multiclass-multioutput classification tasks, got {self.task}."
+ )
+ elif self.task.startswith("multiclass"):
+ _, target = self.branch._get_target(target)
+ else:
+ target = 0
+
+ kind = "+".join(lst(kind)).lower()
+ if any(k not in ("average", "individual") for k in kind.split("+")):
+ raise ValueError(
+ f"Invalid value for the kind parameter, got {kind}. "
+ "Choose from: average, individual."
+ )
+
+ axes, names = [], []
+ fig = self._get_figure()
+ for m in models:
+ color = BasePlot._fig.get_elem(m.name)
+
+ # Since every model can have different fxs, select them
+ # every time and make sure the models use the same fxs
+ cols = m.branch._get_columns(
+ columns=(0, 1, 2) if columns is None else columns,
+ include_target=False,
+ )
+
+ if not names:
+ names = cols
+ elif names != cols:
+ raise ValueError(
+ "Invalid value for the columns parameter. Not all "
+ f"models use the same features, got {names} and {cols}."
+ )
+
+ if pair is not None:
+ if len(models) > 1:
+ raise ValueError(
+ f"Invalid value for the pair parameter, got {pair}. "
+ "The value must be None when plotting multiple models"
+ )
+ else:
+ pair = m.branch._get_columns(pair, include_target=False)
+ cols = [(c, pair[0]) for c in cols]
+ else:
+ cols = [(c,) for c in cols]
+
+ # Create new axes
+ if not axes:
+ for i, col in enumerate(cols):
+ # Calculate the distance between subplots
+ offset = divide(0.025, len(cols) - 1)
+
+ # Calculate the size of the subplot
+ size = (1 - ((offset * 2) * (len(cols) - 1))) / len(cols)
+
+ # Determine the position for the axes
+ x_pos = i % len(cols) * (size + 2 * offset)
+
+ xaxis, yaxis = BasePlot._fig.get_axes(x=(x_pos, rnd(x_pos + size)))
+ axes.append((xaxis, yaxis))
+
+ # Compute averaged predictions
+ predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)(
+ delayed(partial_dependence)(
+ estimator=m.estimator,
+ X=m.X_test,
+ features=col,
+ kind="both" if "individual" in kind else "average",
+ ) for col in cols
+ )
+
+ # Compute deciles for ticks (only if line plots)
+ if len(cols[0]) == 1:
+ deciles = {}
+ for fx in chain.from_iterable(cols):
+ if fx not in deciles: # Skip if the feature is repeated
+ X_col = _safe_indexing(m.X_test, fx, axis=1)
+ deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1))
+
+ for i, (ax, fx, pred) in enumerate(zip(axes, cols, predictions)):
+ # Draw line or contour plot
+ if len(pred["values"]) == 1:
+ # For both average and individual: draw ticks on the horizontal axis
+ for line in deciles[fx[0]]:
+ fig.add_shape(
+ type="line",
+ x0=line,
+ x1=line,
+ xref=ax[0],
+ y0=0,
+ y1=0.05,
+ yref=f"{axes[0][1]} domain",
+ line=dict(width=1, color=BasePlot._fig.get_elem(m.name)),
+ opacity=0.6,
+ layer="below",
+ )
+
+ # Draw the mean of the individual lines
+ if "average" in kind:
+ fig.add_trace(
+ go.Scatter(
+ x=pred["values"][0],
+ y=pred["average"][target].ravel(),
+ mode="lines",
+ line=dict(width=2, color=color),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=ax[0],
+ yaxis=axes[0][1],
+ )
+ )
+
+ # Draw all individual (per sample) lines (ICE)
+ if "individual" in kind:
+ # Select up to 50 random samples to plot
+ idx = np.random.choice(
+ list(range(len(pred["individual"][target]))),
+ size=min(len(pred["individual"][target]), 50),
+ replace=False,
+ )
+ for sample in pred["individual"][target, idx, :]:
+ fig.add_trace(
+ go.Scatter(
+ x=pred["values"][0],
+ y=sample,
+ mode="lines",
+ line=dict(width=0.5, color=color),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=ax[0],
+ yaxis=axes[0][1],
+ )
+ )
+
+ else:
+ colorscale = PALETTE.get(BasePlot._fig.get_elem(m.name), "Teal")
+ fig.add_trace(
+ go.Contour(
+ x=pred["values"][0],
+ y=pred["values"][1],
+ z=pred["average"][target],
+ contours=dict(
+ showlabels=True,
+ labelfont=dict(size=self.tick_fontsize, color="white")
+ ),
+ hovertemplate="x:%{x}
y:%{y}
z:%{z}",
+ hoverongaps=False,
+ colorscale=colorscale,
+ showscale=False,
+ showlegend=False,
+ xaxis=ax[0],
+ yaxis=axes[0][1],
+ )
+ )
+
+ self._plot(
+ ax=(f"xaxis{ax[0][1:]}", f"yaxis{ax[1][1:]}"),
+ xlabel=fx[0],
+ ylabel=(fx[1] if len(fx) > 1 else "Score") if i == 0 else None,
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ groupclick="togglegroup",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_partial_dependence",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model)
+ def plot_permutation_importance(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ show: INT | None = None,
+ n_repeats: INT = 10,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the feature permutation importance of models.
+
+ !!! warning
+ This method can be slow. Results are cached to fasten
+ repeated calls.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ show: int or None, default=None
+ Number of features (ordered by importance) to show. If
+ None, it shows all features.
+
+ n_repeats: int, default=10
+ Number of times to permute each feature.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of features shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_feature_importance
+ atom.plots:PredictionPlot.plot_partial_dependence
+ atom.plots:PredictionPlot.plot_parshap
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_permutation_importance(show=10, n_repeats=7)
+ ```
+
+ """
+ show = self._get_show(show, models)
+
+ if n_repeats <= 0:
+ raise ValueError(
+ "Invalid value for the n_repeats parameter."
+ f"Value should be >0, got {n_repeats}."
+ )
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ for m in models:
+ # Permutation importances returns Bunch object
+ permutations = self._memory.cache(permutation_importance)(
+ estimator=m.estimator,
+ X=m.X_test,
+ y=m.y_test,
+ scoring=self._metric[0],
+ n_repeats=n_repeats,
+ n_jobs=self.n_jobs,
+ random_state=self.random_state,
+ )
+
+ fig.add_trace(
+ go.Box(
+ x=permutations["importances"].ravel(),
+ y=list(np.array([[fx] * n_repeats for fx in m.features]).ravel()),
+ marker_color=BasePlot._fig.get_elem(m.name),
+ boxpoints="outliers",
+ orientation="h",
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ "boxmode": "group",
+ }
+ )
+
+ # Unique number of features over all branches
+ n_fxs = len(set([fx for m in models for fx in m.features]))
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Score",
+ ylim=(n_fxs - show - 0.5, n_fxs - 0.5),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_permutation_importance",
+ filename=filename,
+ display=display,
+ )
+
    @composed(crash, plot_from_model(check_fitted=False))
    def plot_pipeline(
        self,
        models: INT | str | MODEL | slice | SEQUENCE | None = None,
        draw_hyperparameter_tuning: bool = True,
        color_branches: bool | None = None,
        *,
        title: str | dict | None = None,
        legend: LEGEND | dict | None = None,
        figsize: tuple[INT, INT] | None = None,
        filename: str | None = None,
        display: bool | None = True,
    ) -> plt.Figure | None:
        """Plot a diagram of the pipeline.

        !!! warning
            This plot uses the [schemdraw][] package, which is
            incompatible with [plotly][]. The returned plot is
            therefore a [matplotlib figure][pltfigure].

        Parameters
        ----------
        models: int, str, Model, slice, sequence or None, default=None
            Models for which to draw the pipeline. If None, all
            pipelines are plotted.

        draw_hyperparameter_tuning: bool, default=True
            Whether to draw if the models used Hyperparameter Tuning.

        color_branches: bool or None, default=None
            Whether to draw every branch in a different color. If None,
            branches are colored when there is more than one.

        title: str, dict or None, default=None
            Title for the plot.

            - If None, no title is shown.
            - If str, text for the title.
            - If dict, [title configuration][parameters].

        legend: str, dict or None, default=None
            Does nothing. Implemented for continuity of the API.

        figsize: tuple or None, default=None
            Figure's size in pixels, format as (x, y). If None, it
            adapts the size to the pipeline drawn.

        filename: str or None, default=None
            Save the plot using this name. Use "auto" for automatic
            naming. The type of the file depends on the provided name
            (.html, .png, .pdf, etc...). If `filename` has no file type,
            the plot is saved as png. If None, the plot is not saved.

        display: bool or None, default=True
            Whether to render the plot. If None, it returns the figure.

        Returns
        -------
        [plt.Figure][] or None
            Plot object. Only returned if `display=None`.

        See Also
        --------
        atom.plots:DataPlot.plot_wordcloud

        Examples
        --------
        ```pycon
        from atom import ATOMClassifier
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True, as_frame=True)

        atom = ATOMClassifier(X, y, random_state=1)
        atom.run(["GNB", "RNN", "SGD", "MLP"])
        atom.voting(models=atom.winners[:2])
        atom.plot_pipeline()

        atom = ATOMClassifier(X, y, random_state=1)
        atom.scale()
        atom.prune()
        atom.run("RF", n_trials=30)

        atom.branch = "undersample"
        atom.balance("nearmiss")
        atom.run("RF_undersample")

        atom.branch = "oversample_from_master"
        atom.balance("smote")
        atom.run("RF_oversample")

        atom.plot_pipeline()
        ```

        """

        def get_length(pl, i):
            """Get the maximum length of the name of a block."""
            # 0.5 is the approximate width per character; 7 is the minimum
            # block width so short names still get a readable box
            if len(pl) > i:
                return max(len(pl[i].__class__.__name__) * 0.5, 7)
            else:
                return 0

        def check_y(xy):
            """Return y unless there is something right, then jump."""
            # Step down one block height at a time until no already-drawn
            # element lies to the right on the same row (avoids overlap)
            while any(pos[0] > xy[0] and pos[1] == xy[1] for pos in positions.values()):
                xy = Point((xy[0], xy[1] + height))

            return xy[1]

        def add_wire(x, y):
            """Draw a connecting wire between two estimators."""
            d.add(
                Wire(shape="z", k=(x - d.here[0]) / (length + 1), arrow="->")
                .to((x, y))
                .color(branch["color"])
            )

            # Update arrowhead manually
            d.elements[-1].segments[-1].arrowwidth = 0.3
            d.elements[-1].segments[-1].arrowlength = 0.5

        check_dependency("schemdraw")
        from schemdraw import Drawing
        from schemdraw.flow import Data, RoundBox, Subroutine, Wire
        from schemdraw.util import Point

        fig = self._get_figure(backend="matplotlib")
        check_canvas(BasePlot._fig.is_canvas, "plot_pipeline")

        # Define branches to plot (if called from model, it's only one)
        branches = []
        for branch in getattr(self, "_branches", [self.branch]):
            draw_models, draw_ensembles = [], []
            for m in models:
                if m.branch is branch:
                    if m.acronym not in ("Stack", "Vote"):
                        draw_models.append(m)
                    else:
                        draw_ensembles.append(m)

                        # Additionally, add all dependent models (if not already there)
                        draw_models.extend([i for i in m._models if i not in draw_models])

            if not models or draw_models:
                branches.append(
                    {
                        "name": branch.name,
                        "pipeline": list(branch.pipeline),
                        "models": draw_models,
                        "ensembles": draw_ensembles,
                    }
                )

        # Define colors per branch
        for branch in branches:
            if color_branches or (color_branches is None and len(branches) > 1):
                color = next(BasePlot._fig.palette)

                # Convert back to format accepted by matplotlib
                branch["color"] = unconvert_from_RGB_255(unlabel_rgb(color))
            else:
                branch["color"] = "black"

        # Create schematic drawing
        d = Drawing(unit=1, backend="matplotlib")
        d.config(fontsize=self.tick_fontsize)
        d.add(Subroutine(w=8, s=0.7).label("Raw data"))

        height = 3  # Height of every block
        length = 5  # Minimum arrow length

        # Define the x-position for every block
        # (column i must be wide enough for the widest block of any branch)
        x_pos = [d.here[0] + length]
        for i in range(max(len(b["pipeline"]) for b in branches)):
            len_block = reduce(max, [get_length(b["pipeline"], i) for b in branches])
            x_pos.append(x_pos[-1] + length + len_block)

        # Add positions for scaling, hyperparameter tuning and models
        # (x_pos[-3] = scaler column, x_pos[-2] = tuning column, x_pos[-1] = model column)
        x_pos.extend([x_pos[-1], x_pos[-1]])
        if any(m.scaler for m in models):
            x_pos[-1] = x_pos[-2] = x_pos[-3] + length + 7
        if draw_hyperparameter_tuning and any(m.trials is not None for m in models):
            x_pos[-1] = x_pos[-2] + length + 11

        positions = {0: d.here}  # Contains the position of every element
        for branch in branches:
            # Every branch starts drawing from the raw data block
            d.here = positions[0]

            for i, est in enumerate(branch["pipeline"]):
                # If the estimator has already been seen, don't draw
                if id(est) in positions:
                    # Change location to estimator's end
                    d.here = positions[id(est)]
                    continue

                # Draw transformer
                add_wire(x_pos[i], check_y(d.here))
                d.add(
                    RoundBox(w=max(len(est.__class__.__name__) * 0.5, 7))
                    .label(est.__class__.__name__, color="k")
                    .color(branch["color"])
                    .anchor("W")
                    .drop("E")
                )

                positions[id(est)] = d.here

            for model in branch["models"]:
                # Position at last transformer or at start
                if branch["pipeline"]:
                    d.here = positions[id(est)]
                else:
                    d.here = positions[0]

                # For a single branch, center models
                if len(branches) == 1:
                    offset = height * (len(branch["models"]) - 1) / 2
                else:
                    offset = 0

                # Draw automated feature scaling
                if model.scaler:
                    add_wire(x_pos[-3], check_y((d.here[0], d.here[1] - offset)))
                    d.add(
                        RoundBox(w=7)
                        .label("Scaler", color="k")
                        .color(branch["color"])
                        .drop("E")
                    )
                    offset = 0  # Offset already applied by the first wire

                # Draw hyperparameter tuning
                if draw_hyperparameter_tuning and model.trials is not None:
                    add_wire(x_pos[-2], check_y((d.here[0], d.here[1] - offset)))
                    d.add(
                        Data(w=11)
                        .label("Hyperparameter\nTuning", color="k")
                        .color(branch["color"])
                        .drop("E")
                    )
                    offset = 0

                # Remove classifier/regressor from model's name
                name = model.estimator.__class__.__name__
                if name.lower().endswith("classifier"):
                    name = name[:-10]
                elif name.lower().endswith("regressor"):
                    name = name[:-9]

                # Draw model
                add_wire(x_pos[-1], check_y((d.here[0], d.here[1] - offset)))
                d.add(
                    Data(w=max(len(name) * 0.5, 7))
                    .label(name, color="k")
                    .color(branch["color"])
                    .anchor("W")
                    .drop("E")
                )

                positions[id(model)] = d.here

        # Draw ensembles
        max_pos = max(pos[0] for pos in positions.values())  # Max length model names
        for branch in branches:
            for model in branch["ensembles"]:
                # Determine y-position of the ensemble: midway between
                # its constituent models, shifted per additional ensemble
                y_pos = [positions[id(m)][1] for m in model._models]
                offset = height / 2 * (len(branch["ensembles"]) - 1)
                y = min(y_pos) + (max(y_pos) - min(y_pos)) * 0.5 - offset
                y = check_y((max_pos + length, max(min(y_pos), y)))

                d.here = (max_pos + length, y)

                d.add(
                    Data(w=max(len(model._fullname) * 0.5, 7))
                    .label(model._fullname, color="k")
                    .color(branch["color"])
                    .anchor("W")
                    .drop("E")
                )

                positions[id(model)] = d.here

                # Draw a wire from every model to the ensemble
                for m in model._models:
                    d.here = positions[id(m)]
                    add_wire(max_pos + length, y)

        if not figsize:
            # Heuristic: scale the figure to the drawing's bounding box
            dpi, bbox = fig.get_dpi(), d.get_bbox()
            figsize = (dpi * bbox.xmax // 4, (dpi / 2) * (bbox.ymax - bbox.ymin))

        d.draw(canvas=plt.gca(), showframe=False, show=False)
        plt.axis("off")

        BasePlot._fig.used_models.extend(models)
        return self._plot(
            ax=plt.gca(),
            title=title,
            legend=legend,
            figsize=figsize,
            plotname="plot_pipeline",
            filename=filename,
            display=display,
        )
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_prc(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower left",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the precision-recall curve.
+
+ Read more about [PRC][] in sklearn's documentation. Only
+ available for binary classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the metric. Use a sequence
+ or add `+` between options to select more than one. Choose
+ from: "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower left"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_det
+ atom.plots:PredictionPlot.plot_lift
+ atom.plots:PredictionPlot.plot_roc
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_prc()
+ ```
+
+ """
+ dataset = self._get_set(dataset, max_one=False)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ for ds in dataset:
+ y_true, y_pred = m._get_pred(ds, target, attr="thresh")
+
+ # Get precision-recall pairs for different thresholds
+ prec, rec, _ = precision_recall_curve(y_true, y_pred)
+
+ fig.add_trace(
+ self._draw_line(
+ x=rec,
+ y=prec,
+ mode="lines",
+ parent=m.name,
+ child=ds,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(sum(m.y_test) / len(m.y_test), xaxis=xaxis, yaxis=yaxis)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Recall",
+ ylabel="Precision",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_prc",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task("class"))
+ @composed(crash, plot_from_model)
+ def plot_probabilities(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str = "test",
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the probability distribution of the target classes.
+
+ This plot is available only for models with a `predict_proba`
+ method in classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str, default="test"
+ Data set on which to calculate the metric. Choose from:
+ "train", "test" or "holdout".
+
+ target: int, str or tuple, default=1
+ Probability of being that class in the target column. For
+ multioutput tasks, the value should be a tuple of the form
+ (column, class).
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_confusion_matrix
+ atom.plots:PredictionPlot.plot_results
+ atom.plots:PredictionPlot.plot_threshold
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_probabilities()
+ ```
+
+ """
+ check_predict_proba(models, "plot_probabilities")
+ ds = self._get_set(dataset, max_one=True)
+ col, cls = self.branch._get_target(target)
+ col = lst(self.target)[col]
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ y_true, y_pred = getattr(m, f"y_{ds}"), getattr(m, f"predict_proba_{ds}")
+ for value in np.unique(m.dataset[col]):
+ # Get indices per class
+ if is_multioutput(self.task):
+ if self.task.startswith("multilabel"):
+ hist = y_pred.loc[y_true[col] == value, col]
+ else:
+ hist = y_pred.loc[cls, col].loc[y_true[col] == value]
+ else:
+ hist = y_pred.loc[y_true == value, str(cls)]
+
+ fig.add_trace(
+ go.Scatter(
+ x=(x := np.linspace(0, 1, 100)),
+ y=stats.gaussian_kde(hist)(x),
+ mode="lines",
+ line=dict(
+ width=2,
+ color=BasePlot._fig.get_elem(m.name),
+ dash=BasePlot._fig.get_elem(ds, "dash"),
+ ),
+ fill="tonexty",
+ fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
+ fillpattern=dict(shape=BasePlot._fig.get_elem(value, "shape")),
+ name=f"{col}={value}",
+ legendgroup=m.name,
+ legendgrouptitle=dict(text=m.name, font_size=self.label_fontsize),
+ showlegend=BasePlot._fig.showlegend(f"{m.name}-{value}", legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="toggleitem",
+ xlabel="Probability",
+ ylabel="Probability density",
+ xlim=(0, 1),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_probabilities",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task("reg"))
+ @composed(crash, plot_from_model)
+ def plot_residuals(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "upper left",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot a model's residuals.
+
+ The plot shows the residuals (difference between the predicted
+ and the true value) on the vertical axis and the independent
+ variable on the horizontal axis. The gray, intersected line
+ shows the identity line. This plot can be useful to analyze the
+ variance of the error of the regressor. If the points are
+ randomly dispersed around the horizontal axis, a linear
+ regression model is appropriate for the data; otherwise, a
+ non-linear model is more appropriate. This plot is only
+ available for regression tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str, default="test"
+ Data set on which to calculate the metric. Choose from:
+ "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multioutput tasks][].
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="upper left"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_errors
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMRegressor
+ from sklearn.datasets import load_diabetes
+
+ X, y = load_diabetes(return_X_y=True, as_frame=True)
+
+ atom = ATOMRegressor(X, y)
+ atom.run(["OLS", "LGB"])
+ atom.plot_residuals()
+ ```
+
+ """
+ ds = self._get_set(dataset, max_one=True)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes(x=(0, 0.69))
+ xaxis2, yaxis2 = BasePlot._fig.get_axes(x=(0.71, 1.0))
+ for m in models:
+ y_true, y_pred = m._get_pred(ds, target)
+
+ fig.add_trace(
+ go.Scatter(
+ x=y_true,
+ y=(res := np.subtract(y_true, y_pred)),
+ mode="markers",
+ line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=BasePlot._fig.showlegend(m.name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.add_trace(
+ go.Histogram(
+ y=res,
+ bingroup="residuals",
+ marker=dict(
+ color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+ line=dict(width=2, color=BasePlot._fig.get_elem(m.name)),
+ ),
+ name=m.name,
+ legendgroup=m.name,
+ showlegend=False,
+ xaxis=xaxis2,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y=0, xaxis=xaxis, yaxis=yaxis)
+
+ fig.update_layout({f"yaxis{xaxis[1:]}_showgrid": True, "barmode": "overlay"})
+
+ self._plot(
+ ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
+ xlabel="Distribution",
+ title=title,
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="togglegroup",
+ ylabel="Residuals",
+ xlabel="True value",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_residuals",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model)
+ def plot_results(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ metric: INT | str | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the model results.
+
+ If all models applied bootstrap, the plot is a boxplot. If
+ not, the plot is a barplot. Models are ordered based on
+ their score from the top down. The score is either the
+ `score_bootstrap` or `score_test` attribute of the model,
+ selected in that order.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ metric: int, str, sequence or None, default=None
+ Metric to plot (only for multi-metric runs). Other available
+ options are "time_bo", "time_fit", "time_bootstrap" and
+ "time". If str, add `+` between options to select more than
+ one. If None, the metric used to run the pipeline is selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of models.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_confusion_matrix
+ atom.plots:PredictionPlot.plot_probabilities
+ atom.plots:PredictionPlot.plot_threshold
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"])
+ atom.plot_results()
+
+ atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"], n_bootstrap=5)
+ atom.plot_results()
+ atom.plot_results(metric="time_fit+time")
+ ```
+
+ """
+
+ def get_std(model: MODEL, metric: int) -> SCALAR:
+ """Get the standard deviation of the bootstrap scores.
+
+ Parameters
+ ----------
+ model: Model
+ Model to get the std from.
+
+ metric: int
+ Index of the metric to get it from.
+
+ Returns
+ -------
+ int or float
+ Standard deviation score or 0 if not bootstrapped.
+
+ """
+ if model.bootstrap is None:
+ return 0
+ else:
+ return model.bootstrap.iloc[:, metric].std()
+
+ metric = self._get_metric(metric, max_one=False)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ for met in metric:
+ if isinstance(met, str):
+ color = BasePlot._fig.get_elem(met)
+ fig.add_trace(
+ go.Bar(
+ x=[getattr(m, met) for m in models],
+ y=[m.name for m in models],
+ orientation="h",
+ marker=dict(
+ color=f"rgba({color[4:-1]}, 0.2)",
+ line=dict(width=2, color=color),
+ ),
+ hovertemplate=f"%{{x}}<extra>{met}</extra>",
+ name=met,
+ legendgroup=met,
+ showlegend=BasePlot._fig.showlegend(met, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+ else:
+ name = self._metric[met].name
+ color = BasePlot._fig.get_elem()
+
+ if all(m.score_bootstrap for m in models):
+ x = np.array([m.bootstrap.iloc[:, met] for m in models]).ravel()
+ y = np.array([[m.name] * len(m.bootstrap) for m in models]).ravel()
+ fig.add_trace(
+ go.Box(
+ x=x,
+ y=list(y),
+ marker_color=color,
+ boxpoints="outliers",
+ orientation="h",
+ name=name,
+ legendgroup=name,
+ showlegend=BasePlot._fig.showlegend(name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+ else:
+ fig.add_trace(
+ go.Bar(
+ x=[get_best_score(m, met) for m in models],
+ y=[m.name for m in models],
+ error_x=dict(
+ type="data",
+ array=[get_std(m, met) for m in models],
+ ),
+ orientation="h",
+ marker=dict(
+ color=f"rgba({color[4:-1]}, 0.2)",
+ line=dict(width=2, color=color),
+ ),
+ hovertemplate="%{x}",
+ name=name,
+ legendgroup=name,
+ showlegend=BasePlot._fig.showlegend(name, legend),
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ fig.update_layout(
+ {
+ f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"),
+ "bargroupgap": 0.05,
+ "boxmode": "group",
+ }
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="time (s)" if all(isinstance(m, str) for m in metric) else "Score",
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + len(models) * 50),
+ plotname="plot_results",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_roc(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ dataset: str | SEQUENCE = "test",
+ target: INT | str = 0,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot the Receiver Operating Characteristics curve.
+
+ Read more about [ROC][] in sklearn's documentation. Only
+ available for classification tasks.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ dataset: str or sequence, default="test"
+ Data set on which to calculate the metric. Use a sequence
+ or add `+` between options to select more than one. Choose
+ from: "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_gains
+ atom.plots:PredictionPlot.plot_lift
+ atom.plots:PredictionPlot.plot_prc
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_roc()
+ ```
+
+ """
+ dataset = self._get_set(dataset, max_one=False)
+ target = self.branch._get_target(target, only_columns=True)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+ for m in models:
+ for ds in dataset:
+ # Get False (True) Positive Rate as arrays
+ fpr, tpr, _ = roc_curve(*m._get_pred(ds, target, attr="thresh"))
+
+ fig.add_trace(
+ self._draw_line(
+ x=fpr,
+ y=tpr,
+ mode="lines",
+ parent=m.name,
+ child=ds,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis)
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlim=(-0.03, 1.03),
+ ylim=(-0.03, 1.03),
+ xlabel="FPR",
+ ylabel="TPR",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_roc",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(ensembles=False))
+ def plot_successive_halving(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ metric: INT | str | SEQUENCE | None = None,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower right",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot scores per iteration of the successive halving.
+
+ Only use with models fitted using [successive halving][].
+ [Ensembles][] are ignored.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ metric: int, str, sequence or None, default=None
+ Metric to plot (only for multi-metric runs). Use a sequence
+ or add `+` between options to select more than one. If None,
+ the metric used to run the pipeline is selected.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower right"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_learning_curve
+ atom.plots:PredictionPlot.plot_results
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.successive_halving(["Tree", "Bag", "RF", "LGB"], n_bootstrap=5)
+ atom.plot_successive_halving()
+ ```
+
+ """
+ metric = self._get_metric(metric, max_one=False)
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ for met in metric:
+ x, y, std = defaultdict(list), defaultdict(list), defaultdict(list)
+ for m in models:
+ x[m._group].append(len(m.branch._idx[1]) // m._train_idx)
+ y[m._group].append(get_best_score(m, met))
+ if m.bootstrap is not None:
+ std[m._group].append(m.bootstrap.iloc[:, met].std())
+
+ for group in x:
+ fig.add_trace(
+ self._draw_line(
+ x=x[group],
+ y=y[group],
+ mode="lines+markers",
+ marker_symbol="circle",
+ error_y=dict(type="data", array=std[group], visible=True),
+ parent=group,
+ child=self._metric[met].name,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ # Add error bands
+ if m.bootstrap is not None:
+ fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)"
+ fig.add_traces(
+ [
+ go.Scatter(
+ x=x[group],
+ y=np.add(y[group], std[group]),
+ mode="lines",
+ line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ hovertemplate="%{y}<extra>upper bound</extra>",
+ legendgroup=group,
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ ),
+ go.Scatter(
+ x=x[group],
+ y=np.subtract(y[group], std[group]),
+ mode="lines",
+ line=dict(width=1, color=BasePlot._fig.get_elem(group)),
+ fill="tonexty",
+ fillcolor=fillcolor,
+ hovertemplate="%{y}<extra>lower bound</extra>",
+ legendgroup=group,
+ showlegend=False,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ ),
+ ]
+ )
+
+ fig.update_layout({f"xaxis{yaxis[1:]}": dict(dtick=1, autorange="reversed")})
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ groupclick="togglegroup",
+ title=title,
+ legend=legend,
+ xlabel="n_models",
+ ylabel="Score",
+ figsize=figsize,
+ plotname="plot_successive_halving",
+ filename=filename,
+ display=display,
+ )
+
+ @available_if(has_task(["binary", "multilabel"]))
+ @composed(crash, plot_from_model)
+ def plot_threshold(
+ self,
+ models: INT | str | MODEL | slice | SEQUENCE | None = None,
+ metric: METRIC_SELECTOR = None,
+ dataset: str = "test",
+ target: INT | str = 0,
+ steps: INT = 100,
+ *,
+ title: str | dict | None = None,
+ legend: str | dict | None = "lower left",
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> go.Figure | None:
+ """Plot metric performances against threshold values.
+
+ This plot is available only for models with a `predict_proba`
+ method in a binary or [multilabel][] classification task.
+
+ Parameters
+ ----------
+ models: int, str, Model, slice, sequence or None, default=None
+ Models to plot. If None, all models are selected.
+
+ metric: str, func, scorer, sequence or None, default=None
+ Metric to plot. Choose from any of sklearn's scorers, a
+ function with signature `metric(y_true, y_pred)`, a scorer
+ object or a sequence of these. Use a sequence or add `+`
+ between options to select more than one. If None, the
+ metric used to run the pipeline is selected.
+
+ dataset: str, default="test"
+ Data set on which to calculate the metric. Choose from:
+ "train", "test" or "holdout".
+
+ target: int or str, default=0
+ Target column to look at. Only for [multilabel][] tasks.
+
+ steps: int, default=100
+ Number of thresholds measured.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default="lower left"
+ Legend for the plot. See the [user guide][parameters] for
+ an extended description of the choices.
+
+ - If None: No legend is shown.
+ - If str: Location where to show the legend.
+ - If dict: Legend configuration.
+
+ figsize: tuple, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as html. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [go.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_calibration
+ atom.plots:PredictionPlot.plot_confusion_matrix
+ atom.plots:PredictionPlot.plot_probabilities
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import make_classification
+
+ X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run(["LR", "RF"])
+ atom.plot_threshold()
+ ```
+
+ """
+ check_predict_proba(models, "plot_threshold")
+ ds = self._get_set(dataset, max_one=True)
+ target = self.branch._get_target(target, only_columns=True)
+
+ # Get all metric functions from the input
+ if metric is None:
+ metrics = [m._score_func for m in self._metric]
+ else:
+ metrics = []
+ for m in lst(metric):
+ if isinstance(m, str):
+ metrics.extend(m.split("+"))
+ else:
+ metrics.append(m)
+ metrics = [get_custom_scorer(m)._score_func for m in metrics]
+
+ fig = self._get_figure()
+ xaxis, yaxis = BasePlot._fig.get_axes()
+
+ steps = np.linspace(0, 1, steps)
+ for m in models:
+ y_true, y_pred = m._get_pred(ds, target, attr="predict_proba")
+ for met in metrics:
+ fig.add_trace(
+ self._draw_line(
+ x=steps,
+ y=[met(y_true, y_pred >= step) for step in steps],
+ parent=m.name,
+ child=met.__name__,
+ legend=legend,
+ xaxis=xaxis,
+ yaxis=yaxis,
+ )
+ )
+
+ BasePlot._fig.used_models.extend(models)
+ return self._plot(
+ ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+ xlabel="Threshold",
+ ylabel="Score",
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_threshold",
+ filename=filename,
+ display=display,
+ )
diff --git a/atom/plots/shapplot.py b/atom/plots/shapplot.py
new file mode 100644
index 000000000..5c366454c
--- /dev/null
+++ b/atom/plots/shapplot.py
@@ -0,0 +1,866 @@
+# -*- coding: utf-8 -*-
+
+"""
+Automated Tool for Optimized Modelling (ATOM)
+Author: Mavs
+Description: Module containing the ShapPlot class.
+
+"""
+
+from __future__ import annotations
+
+from importlib.util import find_spec
+
+import matplotlib.pyplot as plt
+import shap
+from typeguard import typechecked
+
+from atom.plots.base import BasePlot
+from atom.utils.types import INT, LEGEND, MODEL, SEQUENCE, SLICE
+from atom.utils.utils import check_canvas, composed, crash, plot_from_model
+
+
+@typechecked
+class ShapPlot(BasePlot):
+ """Shap plots.
+
+ ATOM wrapper for plots made by the shap package, using Shapley
+ values for model interpretation. These plots are accessible from
+ the runners or from the models. Only one model can be plotted at
+ the same time since the plots are not made by ATOM.
+
+ """
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_bar(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: SLICE | None = None,
+ show: INT | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> plt.Figure | None:
+ """Plot SHAP's bar plot.
+
+ Create a bar plot of a set of SHAP values. If a single sample
+ is passed, then the SHAP values are plotted. If many samples
+ are passed, then the mean absolute value for each feature
+ column is plotted. Read more about SHAP plots in the
+ [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_bar()`.
+
+ index: int, str, slice, sequence or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set.
+
+ show: int or None, default=None
+ Number of features (ordered by importance) to show. If
+ None, it shows all features.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of features shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_parshap
+ atom.plots:ShapPlot.plot_shap_beeswarm
+ atom.plots:ShapPlot.plot_shap_scatter
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_bar(show=10)
+ ```
+
+ """
+ rows = models.X.loc[models.branch._get_rows(index)]
+ show = self._get_show(show, models)
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ self._get_figure(backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_bar")
+
+ shap.plots.bar(explanation, max_display=show, show=False)
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=plt.gca(),
+ xlabel=plt.gca().get_xlabel(),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_shap_bar",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_beeswarm(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: slice | SEQUENCE | None = None,
+ show: INT | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> plt.Figure | None:
+ """Plot SHAP's beeswarm plot.
+
+ The plot is colored by feature values. Read more about SHAP
+ plots in the [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_beeswarm()`.
+
+ index: slice, sequence or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set. The beeswarm plot does not support plotting
+ a single sample.
+
+ show: int or None, default=None
+ Number of features (ordered by importance) to show. If
+ None, it shows all features.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of features shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:PredictionPlot.plot_parshap
+ atom.plots:ShapPlot.plot_shap_bar
+ atom.plots:ShapPlot.plot_shap_scatter
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_beeswarm(show=10)
+ ```
+
+ """
+ rows = models.X.loc[models.branch._get_rows(index)]
+ show = self._get_show(show, models)
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ self._get_figure(backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_beeswarm")
+
+ shap.plots.beeswarm(explanation, max_display=show, show=False)
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=plt.gca(),
+ xlabel=plt.gca().get_xlabel(),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_decision(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: SLICE | None = None,
+ show: INT | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> plt.Figure | None:
+ """Plot SHAP's decision plot.
+
+ Visualize model decisions using cumulative SHAP values. Each
+ plotted line explains a single model prediction. If a single
+ prediction is plotted, feature values are printed in the
+ plot (if supplied). If multiple predictions are plotted
+ together, feature values will not be printed. Plotting too
+ many predictions together will make the plot unintelligible.
+ Read more about SHAP plots in the [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_decision()`.
+
+ index: int, str, slice, sequence or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set.
+
+ show: int or None, default=None
+ Number of features (ordered by importance) to show. If
+ None, it shows all features.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of features shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:ShapPlot.plot_shap_bar
+ atom.plots:ShapPlot.plot_shap_beeswarm
+ atom.plots:ShapPlot.plot_shap_force
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_decision(show=10)
+ atom.plot_shap_decision(index=-1, show=10)
+ ```
+
+ """
+ rows = models.X.loc[models.branch._get_rows(index)]
+ show = self._get_show(show, models)
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ self._get_figure(backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_decision")
+
+ shap.decision_plot(
+ base_value=explanation.base_values,
+ shap_values=explanation.values,
+ features=rows,
+ feature_display_range=slice(-1, -show - 1, -1),
+ auto_size_plot=False,
+ show=False,
+ )
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=plt.gca(),
+ xlabel=plt.gca().get_xlabel(),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_shap_decision",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_force(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: SLICE | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (900, 300),
+ filename: str | None = None,
+ display: bool | None = True,
+ **kwargs,
+ ) -> plt.Figure | None:
+ """Plot SHAP's force plot.
+
+ Visualize the given SHAP values with an additive force layout.
+ Note that by default this plot will render using javascript.
+ For a regular figure use `matplotlib=True` (this option is
+ only available when only a single sample is plotted). Read more
+ about SHAP plots in the [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_force()`.
+
+ index: int, str, slice, sequence or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=(900, 300)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ **kwargs
+ Additional keyword arguments for [shap.plots.force][force].
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:ShapPlot.plot_shap_beeswarm
+ atom.plots:ShapPlot.plot_shap_scatter
+ atom.plots:ShapPlot.plot_shap_decision
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_force(index=-2, matplotlib=True, figsize=(1800, 300))
+ ```
+
+ """
+ rows = models.X.loc[models.branch._get_rows(index)]
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ self._get_figure(create_figure=False, backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_force")
+
+ plot = shap.force_plot(
+ base_value=explanation.base_values,
+ shap_values=explanation.values,
+ features=rows,
+ show=False,
+ **kwargs,
+ )
+
+ if kwargs.get("matplotlib"):
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ fig=plt.gcf(),
+ ax=plt.gca(),
+ title=title,
+ legend=legend,
+ figsize=figsize,
+ plotname="plot_shap_force",
+ filename=filename,
+ display=display,
+ )
+ else:
+ if filename: # Save to a html file
+ if not filename.endswith(".html"):
+ filename += ".html"
+ shap.save_html(filename, plot)
+ if display and find_spec("IPython"):
+ from IPython.display import display
+
+ shap.initjs()
+ display(plot)
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_heatmap(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: slice | SEQUENCE | None = None,
+ show: INT | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> plt.Figure | None:
+ """Plot SHAP's heatmap plot.
+
+ This plot is designed to show the population substructure of a
+ dataset using supervised clustering and a heatmap. Supervised
+ clustering involves clustering data points not by their original
+ feature values but by their explanations. Read more about SHAP
+ plots in the [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_heatmap()`.
+
+ index: slice, sequence or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set. The plot_shap_heatmap method does not
+ support plotting a single sample.
+
+ show: int or None, default=None
+ Number of features (ordered by importance) to show. If
+ None, it shows all features.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of features shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:ShapPlot.plot_shap_decision
+ atom.plots:ShapPlot.plot_shap_force
+ atom.plots:ShapPlot.plot_shap_waterfall
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_heatmap(show=10)
+ ```
+
+ """
+ rows = models.X.loc[models.branch._get_rows(index)]
+ show = self._get_show(show, models)
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ self._get_figure(backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_heatmap")
+
+ shap.plots.heatmap(explanation, max_display=show, show=False)
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=plt.gca(),
+ xlabel=plt.gca().get_xlabel(),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_shap_heatmap",
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_scatter(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: slice | SEQUENCE | None = None,
+ columns: INT | str = 0,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] = (900, 600),
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> plt.Figure | None:
+ """Plot SHAP's scatter plot.
+
+ Plots the value of the feature on the x-axis and the SHAP value
+ of the same feature on the y-axis. This shows how the model
+ depends on the given feature, and is like a richer extension of
+ the classical partial dependence plots. Vertical dispersion of
+ the data points represents interaction effects. Read more about
+ SHAP plots in the [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_scatter()`.
+
+ index: slice, sequence or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set. The plot_shap_scatter method does not
+ support plotting a single sample.
+
+ columns: int or str, default=0
+ Column to plot.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=(900, 600)
+ Figure's size in pixels, format as (x, y).
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:ShapPlot.plot_shap_beeswarm
+ atom.plots:ShapPlot.plot_shap_decision
+ atom.plots:ShapPlot.plot_shap_force
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_scatter(columns="symmetry error")
+ ```
+
+ """
+ rows = models.X.loc[models.branch._get_rows(index)]
+ column = models.branch._get_columns(columns, include_target=False)[0]
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ # Get explanation for a specific column
+ explanation = explanation[:, models.columns.get_loc(column)]
+
+ self._get_figure(backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_scatter")
+
+ shap.plots.scatter(explanation, color=explanation, ax=plt.gca(), show=False)
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=plt.gca(),
+ xlabel=plt.gca().get_xlabel(),
+ ylabel=plt.gca().get_ylabel(),
+ title=title,
+ legend=legend,
+ plotname="plot_shap_scatter",
+ figsize=figsize,
+ filename=filename,
+ display=display,
+ )
+
+ @composed(crash, plot_from_model(max_one=True))
+ def plot_shap_waterfall(
+ self,
+ models: INT | str | MODEL | None = None,
+ index: INT | str | None = None,
+ show: INT | None = None,
+ target: INT | str | tuple = 1,
+ *,
+ title: str | dict | None = None,
+ legend: LEGEND | dict | None = None,
+ figsize: tuple[INT, INT] | None = None,
+ filename: str | None = None,
+ display: bool | None = True,
+ ) -> plt.Figure | None:
+ """Plot SHAP's waterfall plot.
+
+ The SHAP value of a feature represents the impact of the
+ evidence provided by that feature on the model’s output. The
+ waterfall plot is designed to visually display how the SHAP
+ values (evidence) of each feature move the model output from
+ our prior expectation under the background data distribution,
+ to the final model prediction given the evidence of all the
+ features. Features are sorted by the magnitude of their SHAP
+ values with the smallest magnitude features grouped together
+ at the bottom of the plot when the number of features in the
+        model exceeds the `show` parameter. Read more about SHAP plots
+ in the [user guide][shap].
+
+ Parameters
+ ----------
+ models: int, str, Model or None, default=None
+ Model to plot. If None, all models are selected. Note that
+ leaving the default option could raise an exception if there
+ are multiple models. To avoid this, call the plot directly
+ from a model, e.g. `atom.lr.plot_shap_waterfall()`.
+
+ index: int, str or None, default=None
+ Rows in the dataset to plot. If None, it selects all rows
+ in the test set. The plot_shap_waterfall method does not
+ support plotting multiple samples.
+
+ show: int or None, default=None
+ Number of features (ordered by importance) to show. If
+ None, it shows all features.
+
+ target: int, str or tuple, default=1
+ Class in the target column to target. For multioutput tasks,
+ the value should be a tuple of the form (column, class).
+ Note that for binary and multilabel tasks, the selected
+ class is always the positive one.
+
+ title: str, dict or None, default=None
+ Title for the plot.
+
+ - If None, no title is shown.
+ - If str, text for the title.
+ - If dict, [title configuration][parameters].
+
+ legend: str, dict or None, default=None
+ Does nothing. Implemented for continuity of the API.
+
+ figsize: tuple or None, default=None
+ Figure's size in pixels, format as (x, y). If None, it
+ adapts the size to the number of features shown.
+
+ filename: str or None, default=None
+ Save the plot using this name. Use "auto" for automatic
+ naming. The type of the file depends on the provided name
+ (.html, .png, .pdf, etc...). If `filename` has no file type,
+ the plot is saved as png. If None, the plot is not saved.
+
+ display: bool or None, default=True
+ Whether to render the plot. If None, it returns the figure.
+
+ Returns
+ -------
+ [plt.Figure][] or None
+ Plot object. Only returned if `display=None`.
+
+ See Also
+ --------
+ atom.plots:ShapPlot.plot_shap_bar
+ atom.plots:ShapPlot.plot_shap_beeswarm
+ atom.plots:ShapPlot.plot_shap_heatmap
+
+ Examples
+ --------
+ ```pycon
+ from atom import ATOMClassifier
+ from sklearn.datasets import load_breast_cancer
+
+ X, y = load_breast_cancer(return_X_y=True, as_frame=True)
+
+ atom = ATOMClassifier(X, y, random_state=1)
+ atom.run("LR")
+ atom.plot_shap_waterfall(show=10)
+ ```
+
+ """
+ rows = models.X.loc[[models.branch._get_rows(index)[0]]]
+ show = self._get_show(show, models)
+ target = self.branch._get_target(target)
+ explanation = models._shap.get_explanation(rows, target)
+
+ # Waterfall accepts only one row
+ explanation.values = explanation.values[0]
+ explanation.data = explanation.data[0]
+
+ self._get_figure(backend="matplotlib")
+ check_canvas(BasePlot._fig.is_canvas, "plot_shap_waterfall")
+
+ shap.plots.waterfall(explanation, max_display=show, show=False)
+
+ BasePlot._fig.used_models.append(models)
+ return self._plot(
+ ax=plt.gca(),
+ title=title,
+ legend=legend,
+ figsize=figsize or (900, 400 + show * 50),
+ plotname="plot_shap_waterfall",
+ filename=filename,
+ display=display,
+ )
diff --git a/atom/training.py b/atom/training.py
index de5af1989..decf3926a 100644
--- a/atom/training.py
+++ b/atom/training.py
@@ -20,7 +20,8 @@
from atom.basetrainer import BaseTrainer
from atom.utils.types import (
- BOOL, ENGINE, GOAL, INT, INT_TYPES, METRIC_SELECTOR, PREDICTOR, SEQUENCE,
+ BOOL, ENGINE, INT, INT_TYPES, METRIC_SELECTOR, PREDICTOR, SEQUENCE,
+ WARNINGS,
)
from atom.utils.utils import (
ClassMap, composed, crash, get_best_score, infer_task, lst, method_to_log,
@@ -341,7 +342,7 @@ class DirectClassifier(Direct):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -460,12 +461,12 @@ def __init__(
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: str = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: BOOL | str = False,
+ warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "class"
+ self.goal = "class"
super().__init__(
models, metric, est_params, n_trials, ht_params, n_bootstrap,
parallel, errors, n_jobs, device, engine, backend, verbose,
@@ -559,7 +560,7 @@ class DirectForecaster(Direct):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -675,12 +676,12 @@ def __init__(
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: str = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: BOOL | str = False,
+ warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "fc"
+ self.goal = "fc"
super().__init__(
models, metric, est_params, n_trials, ht_params, n_bootstrap,
parallel, errors, n_jobs, device, engine, backend, verbose, warnings,
@@ -774,7 +775,7 @@ class DirectRegressor(Direct):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -886,19 +887,19 @@ def __init__(
n_trials: INT | dict | SEQUENCE = 0,
ht_params: dict | None = None,
n_bootstrap: INT | dict | SEQUENCE = 0,
- parallel: bool = False,
+ parallel: BOOL = False,
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: INT = 1,
device: str = "cpu",
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: str = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: bool | str = False,
+        warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "reg"
+ self.goal = "reg"
super().__init__(
models, metric, est_params, n_trials, ht_params, n_bootstrap,
parallel, errors, n_jobs, device, engine, backend, verbose, warnings,
@@ -999,7 +1000,7 @@ class SuccessiveHalvingClassifier(SuccessiveHalving):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -1112,19 +1113,19 @@ def __init__(
n_trials: INT | dict | SEQUENCE = 0,
ht_params: dict | None = None,
n_bootstrap: INT | dict | SEQUENCE = 0,
- parallel: bool = False,
+ parallel: BOOL = False,
errors: Literal["raise", "skip", "keep"] = "skip",
n_jobs: INT = 1,
device: str = "cpu",
engine: ENGINE = {"data": "numpy", "estimator": "sklearn"},
backend: str = "loky",
verbose: Literal[0, 1, 2] = 0,
- warnings: bool | str = False,
+        warnings: BOOL | WARNINGS = False,
logger: str | Logger | None = None,
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "class"
+ self.goal = "class"
super().__init__(
models, metric, skip_runs, est_params, n_trials, ht_params,
n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
@@ -1221,7 +1222,7 @@ class SuccessiveHalvingForecaster(SuccessiveHalving):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -1343,7 +1344,7 @@ def __init__(
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "fc"
+ self.goal = "fc"
super().__init__(
models, metric, skip_runs, est_params, n_trials, ht_params,
n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
@@ -1440,7 +1441,7 @@ class SuccessiveHalvingRegressor(SuccessiveHalving):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -1565,7 +1566,7 @@ def __init__(
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "reg"
+ self.goal = "reg"
super().__init__(
models, metric, skip_runs, est_params, n_trials, ht_params,
n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
@@ -1671,7 +1672,7 @@ class TrainSizingClassifier(TrainSizing):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -1796,7 +1797,7 @@ def __init__(
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "class"
+ self.goal = "class"
super().__init__(
models, metric, train_sizes, est_params, n_trials, ht_params,
n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
@@ -1898,7 +1899,7 @@ class TrainSizingForecaster(TrainSizing):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -2020,7 +2021,7 @@ def __init__(
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "fc"
+ self.goal = "fc"
super().__init__(
models, metric, train_sizes, est_params, n_trials, ht_params,
n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
@@ -2122,7 +2123,7 @@ class TrainSizingRegressor(TrainSizing):
- "keep": Keep the model in its state at failure. Note that
this model can break down many other methods after training.
This option is useful to be able to rerun hyperparameter
- optimization after failure without losing previous succesfull
+ optimization after failure without losing previous successful
trials.
n_jobs: int, default=1
@@ -2247,7 +2248,7 @@ def __init__(
experiment: str | None = None,
random_state: INT | None = None,
):
- self.goal: GOAL = "reg"
+ self.goal = "reg"
super().__init__(
models, metric, train_sizes, est_params, n_trials, ht_params,
n_bootstrap, parallel, errors, n_jobs, device, engine, backend,
diff --git a/atom/utils/types.py b/atom/utils/types.py
index e124cc5e4..e362619ec 100644
--- a/atom/utils/types.py
+++ b/atom/utils/types.py
@@ -9,7 +9,9 @@
from __future__ import annotations
-from typing import Callable, Literal, Protocol, TypedDict, Union
+from typing import (
+ Callable, Literal, Protocol, TypedDict, Union, runtime_checkable,
+)
import modin.pandas as md
import numpy as np
@@ -55,8 +57,6 @@
FEATURES = Union[iter, dict, list, tuple, np.ndarray, sps.spmatrix, DATAFRAME]
TARGET = Union[INT, str, dict, SEQUENCE, DATAFRAME]
-BACKEND = Literal["loky", "multiprocessing", "threading", "ray"]
-
DATASET = Literal[
"dataset",
"train",
@@ -73,19 +73,40 @@
]
# Selection of rows or columns by name or position
-SLICE = Union[INT | str | slice | SEQUENCE]
+SLICE = Union[INT, str, slice, SEQUENCE]
# Assignment of index or stratify parameter
-INDEX_SELECTOR = Union[bool | INT | str | SEQUENCE]
+INDEX_SELECTOR = Union[bool, INT, str, SEQUENCE]
-# Allowed values for the goal attribute
-GOAL = Literal["class", "reg", "fc"]
+# Types to initialize a metric
+METRIC_SELECTOR = Union[str, Callable[..., SCALAR], SEQUENCE, None]
-# Metric selectors
-METRIC_SELECTOR = Union[str, Callable[..., SCALAR], SEQUENCE | None]
+# Allowed values for BaseTransformer parameter
+BACKEND = Literal["loky", "multiprocessing", "threading", "ray"]
+WARNINGS = Literal["default", "error", "ignore", "always", "module", "once"]
-# Pruning strategies
-PRUNING = Literal["zscore", "iforest", "ee", "lof", "svm", "dbscan", "hdbscan", "optics"]
+# Data cleaning parameters
+STRAT_NUM = Union[SCALAR, Literal["drop", "mean", "median", "knn", "most_frequent"]]
+DISCRETIZER_STRATS = Literal["uniform", "quantile", "kmeans", "custom"]
+PRUNER_STRATS = Literal[
+ "zscore", "iforest", "ee", "lof", "svm", "dbscan", "hdbscan", "optics"
+]
+SCALER_STRATS = Literal["standard", "minmax", "maxabs", "robust"]
+
+
+# Plotting parameters
+LEGEND = Literal[
+ "upper left",
+ "lower left",
+ "upper right",
+ "lower right",
+ "upper center",
+ "lower center",
+ "center left",
+ "center right",
+ "center",
+ "out",
+]
# Classes for type hinting ========================================= >>
@@ -96,28 +117,32 @@ class ENGINE(TypedDict, total=False):
estimator: Literal["sklearn", "sklearnex", "cuml"]
+@runtime_checkable
class SCORER(Protocol):
"""Protocol for all scorers."""
def _score(self, method_caller, clf, X, y, sample_weight=None): ...
+@runtime_checkable
class TRANSFORMER(Protocol):
"""Protocol for all predictors."""
- def fit(self, **params): ...
def transform(self, **params): ...
+@runtime_checkable
class PREDICTOR(Protocol):
"""Protocol for all predictors."""
def fit(self, **params): ...
def predict(self, **params): ...
+@runtime_checkable
class ESTIMATOR(Protocol):
"""Protocol for all estimators."""
def fit(self, **params): ...
+@runtime_checkable
class BRANCH(Protocol):
"""Protocol for the Branch class."""
def _get_rows(self, **params): ...
@@ -125,12 +150,14 @@ def _get_columns(self, **params): ...
def _get_target(self, **params): ...
+@runtime_checkable
class MODEL(Protocol):
"""Protocol for all models."""
- def est_class(self): ...
- def get_estimator(self, **params): ...
+ def _est_class(self): ...
+ def _get_est(self, **params): ...
+@runtime_checkable
class RUNNER(Protocol):
"""Protocol for all runners."""
def run(self, **params): ...
diff --git a/atom/utils/utils.py b/atom/utils/utils.py
index dce01fcb1..9f2fb715e 100644
--- a/atom/utils/utils.py
+++ b/atom/utils/utils.py
@@ -14,7 +14,7 @@
import sys
import tempfile
import warnings
-from collections import OrderedDict, deque
+from collections import deque
from collections.abc import MutableMapping
from contextlib import contextmanager
from copy import copy, deepcopy
@@ -25,10 +25,10 @@
from importlib.util import find_spec
from inspect import Parameter, signature
from itertools import cycle
-from types import GeneratorType
+from types import GeneratorType, MappingProxyType
from typing import Any, Callable
from unittest.mock import patch
-
+from joblib import Memory
import mlflow
import modin.pandas as md
import numpy as np
@@ -54,7 +54,7 @@
-BRANCH, DATAFRAME, DATAFRAME_TYPES, ESTIMATOR, FEATURES, FLOAT,
+BOOL, BRANCH, DATAFRAME, DATAFRAME_TYPES, ESTIMATOR, FEATURES, FLOAT,
INDEX_SELECTOR, INT, INT_TYPES, MODEL, PANDAS, PANDAS_TYPES, PREDICTOR,
SCALAR, SCORER, SEQUENCE, SEQUENCE_TYPES, SERIES, SERIES_TYPES, TARGET,
TRANSFORMER,
)
@@ -118,10 +118,11 @@ def __init__(self, scorer: SCORER, task: str):
self.scorer = scorer
self.task = task
- @staticmethod
- def get_final_error(error: FLOAT, weight: FLOAT) -> FLOAT:
+ def get_final_error(self, error: FLOAT, weight: FLOAT) -> FLOAT:
"""Returns final value of metric based on error and weight.
+ Can't be a `staticmethod` because of CatBoost's implementation.
+
Parameters
----------
error: float
@@ -1253,7 +1254,7 @@ def to_rgb(c: str) -> str:
return c
-def sign(obj: Callable) -> OrderedDict:
+def sign(obj: Callable) -> MappingProxyType:
"""Get the parameters of an object.
Parameters
@@ -1263,7 +1264,7 @@ def sign(obj: Callable) -> OrderedDict:
Returns
-------
- OrderedDict
+ mappingproxy
Object's parameters.
"""
@@ -1315,7 +1316,7 @@ def get_cols(elem: PANDAS) -> list[SERIES]:
def variable_return(
X: DATAFRAME | None,
y: SERIES | None,
-) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]:
+) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]:
"""Return one or two arguments depending on which is None.
This utility is used to make methods return only the provided
@@ -1326,7 +1327,7 @@ def variable_return(
X: dataframe or None
Feature set.
- y: series or None
+ y: series, dataframe or None
Target column.
Returns
@@ -1666,7 +1667,10 @@ def to_pyarrow(column: SERIES, inverse: bool = False) -> str:
"""
if not inverse and not column.dtype.name.endswith("[pyarrow]"):
- return f"{column.dtype.name}[pyarrow]"
+ if column.dtype.name == "object":
+ return "string[pyarrow]" # pyarrow doesn't support object
+ else:
+ return f"{column.dtype.name}[pyarrow]"
elif inverse and column.dtype.name.endswith("[pyarrow]"):
return column.dtype.name[:-9]
@@ -2092,7 +2096,12 @@ def get_feature_importance(
return np.abs(data.flatten())
-def export_pipeline(pipeline: pd.Series, model: MODEL | None, memory, verbose) -> Any:
+def export_pipeline(
+ pipeline: pd.Series,
+ model: MODEL | None = None,
+ memory: BOOL | str | Memory | None = None,
+ verbose: INT | None = None,
+) -> Any:
"""Export a pipeline to a sklearn-like object.
Optionally, you can add a model as final estimator.
@@ -2516,7 +2525,7 @@ def fit_transform_one(
y: TARGET | None = None,
message: str | None = None,
**fit_params,
-) -> tuple[DATAFRAME | None, SERIES | None]:
+) -> tuple[DATAFRAME | None, SERIES | None, TRANSFORMER]:
"""Fit and transform the data using one estimator.
Parameters
@@ -2565,10 +2574,10 @@ def fit_transform_one(
def custom_transform(
transformer: TRANSFORMER,
branch: BRANCH,
- data: tuple[DATAFRAME, SERIES] | None = None,
+ data: tuple[DATAFRAME, PANDAS] | None = None,
verbose: int | None = None,
method: str = "transform",
-) -> tuple[DATAFRAME, SERIES]:
+) -> tuple[DATAFRAME, PANDAS]:
"""Applies a transformer on a branch.
This function is generic and should work for all
@@ -2600,8 +2609,8 @@ def custom_transform(
dataframe
Feature set.
- series
- Target column.
+ series or dataframe
+ Target column(s).
"""
# Select provided data or from the branch
diff --git a/docs/404.html b/docs/404.html
index 3d4c3e8b2..5ea3a762c 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -1147,7 +1147,7 @@
- DirectRegressor
+ DirectForecaster
@@ -1189,7 +1189,7 @@
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1231,7 +1231,7 @@
- TrainSizingRegressor
+ TrainSizingForecaster
diff --git a/docs/API/ATOM/atomclassifier/index.html b/docs/API/ATOM/atomclassifier/index.html
index 6220e7e60..8806c1621 100644
--- a/docs/API/ATOM/atomclassifier/index.html
+++ b/docs/API/ATOM/atomclassifier/index.html
@@ -1288,7 +1288,7 @@
- DirectRegressor
+ DirectForecaster
@@ -1330,7 +1330,7 @@
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1372,7 +1372,7 @@
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3392,7 +3392,7 @@
ATOMClassifier
-
class atom.api.
ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)
[source] Main class for classification tasks.
+
class atom.api.
ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)
[source] Main class for classification tasks.
Apply all data transformations and model management provided by
the package on a given dataset. Note that, contrary to sklearn's
API, the instance contains the dataset on which to perform the
@@ -3427,7 +3427,6 @@
ATOMClassifier
y: int, str, dict, sequence or dataframe, default=-1
Target column corresponding to X.
-- If None: y is ignored.
- If int: Position of the target column in X.
- If str: Name of the target column in X.
- If sequence: Target array with shape=(n_samples,) or
@@ -3496,17 +3495,16 @@
ATOMClassifier
follows the SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu"
to use the GPU. Read more in the
user guide.
-
engine: dict or None, default=None
engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for
data and
estimators. The value should be a
dictionary with keys
data
and/or
estimator
, with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3514,7 +3512,7 @@ ATOMClassifier
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "sklearnex"
- "cuml"
@@ -3611,15 +3609,15 @@ Data attributes
visualize the pipeline, use the plot_pipeline method.
mapping: dict
Encoded values and their respective mapped values.
The column name is the key to its mapping dictionary. Only for
columns mapped to a single column (e.g. Ordinal, Leave-one-out,
-etc...).
dataset: dataframe
Complete data set.
train: dataframe
Training set.
test: dataframe
Test set.
X: dataframe
Feature set.
y: series | dataframe
Target column(s).
X_train: dataframe
Features of the training set.
y_train: series | dataframe
Target column(s) of the training set.
X_test: dataframe
Features of the test set.
y_test: series | dataframe
Target column(s) of the test set.
shape: tuple[int, int]
Shape of the dataset (n_rows, n_columns).
columns: series
Name of all the columns.
n_columns: int
Number of columns.
features: series
Name of the features.
n_features: int
Number of features.
target: str | list[str]
Name of the target column(s).
scaled: bool
Whether the feature set is scaled.
+etc...).
dataset: dataframe
Complete data set.
train: dataframe
Training set.
test: dataframe
Test set.
X: dataframe
Feature set.
y: series | dataframe
Target column(s).
X_train: dataframe
Features of the training set.
y_train: series | dataframe
Target column(s) of the training set.
X_test: dataframe
Features of the test set.
y_test: series | dataframe
Target column(s) of the test set.
shape: tuple[int, int]
Shape of the dataset (n_rows, n_columns).
columns: index
Name of all the columns.
n_columns: int
Number of columns.
features: index
Name of the features.
n_features: int
Number of features.
target: str | list[str]
Name of the target column(s).
scaled: bool
Whether the feature set is scaled.
A data set is considered scaled when it has mean=0 and std=1,
or when there is a scaler in the pipeline. Binary columns (only
-0s and 1s) are excluded from the calculation.
duplicates: series
Number of duplicate rows in the dataset.
missing: list
Values that are considered "missing".
+0s and 1s) are excluded from the calculation.
duplicates: int
Number of duplicate rows in the dataset.
missing: list
Values that are considered "missing".
These values are used by the clean and
impute methods. Default values are: None, NaN,
NaT, +inf, -inf, "", "?", "None", "NA", "nan", "NaN", "NaT",
"inf". Note that None, NaN, +inf and -inf are always considered
-missing since they are incompatible with sklearn estimators.
nans: series | None
Columns with the number of missing values in them.
n_nans: int | None
Number of samples containing missing values.
numerical: series
Names of the numerical features in the dataset.
n_numerical: int
Number of numerical features in the dataset.
categorical: series
Names of the categorical features in the dataset.
n_categorical: int
Number of categorical features in the dataset.
outliers: series | None
Columns in training set with amount of outlier values.
n_outliers: int | None
Number of samples in the training set containing outliers.
classes: pd.DataFrame | None
Distribution of target classes per data set.
n_classes: int | series | None
Number of classes in the target column(s).
+missing since they are incompatible with sklearn estimators.
nans: series | None
Columns with the number of missing values in them.
n_nans: int | None
Number of samples containing missing values.
numerical: index
Names of the numerical features in the dataset.
n_numerical: int
Number of numerical features in the dataset.
categorical: index
Names of the categorical features in the dataset.
n_categorical: int
Number of categorical features in the dataset.
outliers: pd.Series | None
Columns in training set with amount of outlier values.
n_outliers: int | None
Number of samples in the training set containing outliers.
classes: pd.DataFrame | None
Distribution of target classes per data set.
n_classes: int | series | None
Number of classes in the target column(s).
Utility attributes
@@ -3669,7 +3667,7 @@ Plot attributes
The plot attributes are used to customize the plot's aesthetics. Read
more in the user guide.
-Attributes | palette: str | SEQUENCE
Color palette.
+ Attributes | palette: str | sequence
Color palette.
Specify one of plotly's built-in palettes or create
a custom one, e.g. atom.palette = ["red", "green", "blue"] . title_fontsize: int
Fontsize for the plot's title. label_fontsize: int
Fontsize for the labels, legend and hover information. tick_fontsize: int
Fontsize for the ticks along the plot's axes. line_width: int
Width of the line plots. marker_size: int
Size of the markers. |
|
@@ -3681,7 +3679,7 @@ Utility methods
add | Add a transformer to the pipeline. |
apply | Apply a function to the dataset. |
automl | Search for an optimized pipeline in an automated fashion. |
available_models | Give an overview of the available predefined models. |
canvas | Create a figure with multiple plots. |
clear | Reset attributes and clear cache from all models. |
delete | Delete models. |
distribution | Get statistics on column distributions. |
eda | Create an Exploratory Data Analysis report. |
evaluate | Get all models' scores for the provided metrics. |
export_pipeline | Export the pipeline to a sklearn-like object. |
get_class_weight | Return class weights for a balanced data set. |
get_sample_weight | Return sample weights for a balanced data set. |
inverse_transform | Inversely transform new data through the pipeline. |
load | Loads an atom instance from a pickle file. |
log | Print message and save to log file. |
merge | Merge another instance of the same class into this one. |
update_layout | Update the properties of the plot's layout. |
update_traces | Update the properties of the plot's traces. |
reset | Reset the instance to it's initial state. |
reset_aesthetics | Reset the plot aesthetics to their default values. |
save | Save the instance to a pickle file. |
save_data | Save the data in the current branch to a .csv file. |
shrink | Converts the columns to the smallest possible matching dtype. |
stacking | Add a Stacking model to the pipeline. |
stats | Display basic information about the dataset. |
status | Get an overview of the branches and models. |
transform | Transform new data through the pipeline. |
voting | Add a Voting model to the pipeline. |
-
method add(transformer, columns=None, train_only=False, **fit_params)
[source] Add a transformer to the pipeline.
+
method add(transformer, columns=None, train_only=False, **fit_params)
[source] Add a transformer to the pipeline.
If the transformer is not fitted, it is fitted on the complete
training set. Afterwards, the data set is transformed and the
estimator is added to atom's pipeline. If the estimator is
@@ -3744,10 +3742,9 @@
Utility methods
-
method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)
[source] Apply a function to the dataset.
-The function should have signature func(dataset, **kw_args) ->
-dataset
. This method is useful for stateless transformations
-such as taking the log, doing custom scaling, etc...
+
method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)
[source] Apply a function to the dataset.
+This method is useful for stateless transformations such as
+taking the log, doing custom scaling, etc...
Note
This approach is preferred over changing the dataset directly
@@ -3760,7 +3757,8 @@
Utility methods
Parameters | func: callable
-Function to apply.
+Function to apply with signature func(dataset, **kw_args) ->
+dataset .
inverse_func: callable or None, default=None
Inverse function of func . If None, the inverse_transform
method returns the input unchanged.
@@ -3771,7 +3769,7 @@ Utility methods
|
-
Search for an optimized pipeline in an automated fashion.
+
Search for an optimized pipeline in an automated fashion.
Automated machine learning (AutoML) automates the selection,
composition and parameterization of machine learning pipelines.
Automating the machine learning often provides faster, more
@@ -3793,7 +3791,7 @@
Utility methods
-
Give an overview of the available predefined models.
+
Give an overview of the available predefined models.
Returns | pd.DataFrame
Information about the available predefined models. Columns
@@ -3815,7 +3813,7 @@ Utility methods
|
-
method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)
[source] Create a figure with multiple plots.
+
method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)
[source] Create a figure with multiple plots.
This @contextmanager
allows you to draw many plots in one
figure. The default option is to add two plots side by side.
See the user guide for an example.
@@ -3860,7 +3858,7 @@ Utility methods
-
Reset attributes and clear cache from all models.
+
Reset attributes and clear cache from all models.
Reset certain model attributes to their initial state, deleting
potentially large data arrays. Use this method to free some
memory before saving the instance. The affected
@@ -3875,7 +3873,7 @@
Utility methods
Cached holdout data sets
-
Delete models.
+
Delete models.
If all models are removed, the metric is reset. Use this method
to drop unwanted models from the pipeline or to free some memory
before saving. Deleted models are not removed from
@@ -3886,7 +3884,7 @@
Utility methods
-
method distribution(distributions=None, columns=None)
[source] Get statistics on column distributions.
+
method distribution(distributions=None, columns=None)
[source] Get statistics on column distributions.
Compute the Kolmogorov-Smirnov test for various
distributions against columns in the dataset. Only for numerical
columns. Missing values are ignored.
@@ -3916,7 +3914,7 @@ Utility methods
-
method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)
[source] Create an Exploratory Data Analysis report.
+
method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)
[source] Create an Exploratory Data Analysis report.
ATOM uses the ydata-profiling package for the EDA.
The report is rendered directly in the notebook. The created
ProfileReport instance can be accessed through the report
@@ -3940,7 +3938,7 @@
Utility methods
-
method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)
[source] Get all models' scores for the provided metrics.
+
method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)
[source] Get all models' scores for the provided metrics.
Parameters | metric: str, func, scorer, sequence or None, default=None
Metric to calculate. If None, it returns an overview of
@@ -3967,7 +3965,7 @@ Utility methods
|
-
method export_pipeline(model=None, memory=None, verbose=None)
[source] Export the pipeline to a sklearn-like object.
+
method export_pipeline(model=None, memory=None, verbose=None)
[source] Export the pipeline to a sklearn-like object.
Optionally, you can add a model as final estimator. The
returned pipeline is already fitted on the training set.
@@ -4006,7 +4004,7 @@
Utility methods
-
method get_class_weight(dataset="train")
[source] Return class weights for a balanced data set.
+
method get_class_weight(dataset="train")
[source] Return class weights for a balanced data set.
Statistically, the class weights re-balance the data set so
that the sampled data set represents the target population
as closely as possible. The returned weights are inversely
@@ -4021,7 +4019,7 @@
Utility methods
-
method get_sample_weight(dataset="train")
[source] Return sample weights for a balanced data set.
+
method get_sample_weight(dataset="train")
[source] Return sample weights for a balanced data set.
The returned weights are inversely proportional to the class
frequencies in the selected data set. For multioutput tasks,
the weights of each column of y
will be multiplied.
@@ -4034,7 +4032,7 @@ Utility methods
-
method inverse_transform(X=None, y=None, verbose=None)
[source] Inversely transform new data through the pipeline.
+
method inverse_transform(X=None, y=None, verbose=None)
[source] Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. The rest should all implement a inverse_transform
method. If only X
or only y
is provided, it ignores
@@ -4064,7 +4062,7 @@
Utility methods
-
function atom.atom.
load(filename, data=None, transform_data=True, verbose=None)
[source] Loads an atom instance from a pickle file.
+
function atom.atom.
load(filename, data=None, transform_data=True, verbose=None)
[source] Loads an atom instance from a pickle file.
If the instance was saved using save_data=False
,
it's possible to load new data into it and apply all data
transformations.
@@ -4114,7 +4112,7 @@ Utility methods
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -4126,7 +4124,7 @@ Utility methods
|
-
Merge another instance of the same class into this one.
+
Merge another instance of the same class into this one.
Branches, models, metrics and attributes of the other instance
are merged into this one. If there are branches and/or models
with the same name, they are merged adding the suffix
@@ -4144,7 +4142,7 @@
Utility methods
-
Update the properties of the plot's layout.
+
Update the properties of the plot's layout.
Recursively update the structure of the original layout with
the values in the arguments.
@@ -4153,7 +4151,7 @@ Utility methods
-
Update the properties of the plot's traces.
+
Update the properties of the plot's traces.
Recursively update the structure of the original traces with
the values in the arguments.
@@ -4162,13 +4160,13 @@ Utility methods
-
Reset the instance to it's initial state.
+
Reset the instance to it's initial state.
Deletes all branches and models. The dataset is also reset
to its form after initialization.
-
Reset the plot aesthetics to their default values.
+
Reset the plot aesthetics to their default values.
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -4179,7 +4177,7 @@ Utility methods
|
-
method save_data(filename="auto", dataset="dataset", **kwargs)
[source] Save the data in the current branch to a .csv
file.
+
method save_data(filename="auto", dataset="dataset", **kwargs)
[source] Save the data in the current branch to a .csv
file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -4190,7 +4188,7 @@ Utility methods
|
-
method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)
[source] Converts the columns to the smallest possible matching dtype.
+
method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)
[source] Converts the columns to the smallest possible matching dtype.
Parameters | int2bool: bool, default=False
Whether to convert int columns to bool type. Only if the
@@ -4211,7 +4209,7 @@ Utility methods
|
-
method stacking(models=None, name="Stack", **kwargs)
[source] Add a Stacking model to the pipeline.
+
method stacking(models=None, name="Stack", **kwargs)
[source] Add a Stacking model to the pipeline.
Warning
Combining models trained on different branches into one
@@ -4231,18 +4229,18 @@
Utility methods
-
Display basic information about the dataset.
+
Display basic information about the dataset.
Parameters | _vb: int, default=-2
Internal parameter to always print if called by user.
|
-
Get an overview of the branches and models.
+
Get an overview of the branches and models.
This method prints the same information as the __repr__ and
also saves it to the logger.
-
method transform(X=None, y=None, verbose=None)
[source] Transform new data through the pipeline.
+
method transform(X=None, y=None, verbose=None)
[source] Transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. If only X
or only y
is provided, it ignores
transformers that require the other parameter. This can be
@@ -4272,7 +4270,7 @@
Utility methods
-
method voting(models=None, name="Vote", **kwargs)
[source] Add a Voting model to the pipeline.
+
method voting(models=None, name="Vote", **kwargs)
[source] Add a Voting model to the pipeline.
Warning
Combining models trained on different branches into one
@@ -4305,7 +4303,7 @@
Data cleaning
balance | Balance the number of rows per class in the target column. |
clean | Applies standard data cleaning steps on the dataset. |
discretize | Bin continuous data into intervals. |
encode | Perform encoding of categorical features. |
impute | Handle missing values in the dataset. |
normalize | Transform the data to follow a Normal/Gaussian distribution. |
prune | Prune outliers from the training set. |
scale | Scale the data. |
-
method balance(strategy="adasyn", **kwargs)
[source] Balance the number of rows per class in the target column.
+
method balance(strategy="adasyn", **kwargs)
[source] Balance the number of rows per class in the target column.
When oversampling, the newly created samples have an increasing
integer index for numerical indices, and an index of the form
[estimator]_N for non-numerical indices, where N stands for the
@@ -4326,7 +4324,7 @@
Data cleaning
of the target class distribution per data set.
-
method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)
[source] Applies standard data cleaning steps on the dataset.
+
method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)
[source] Applies standard data cleaning steps on the dataset.
Use the parameters to choose which transformations to perform.
The available steps are:
@@ -4340,7 +4338,7 @@ Data cleaning
See the Cleaner class for a description of the parameters.
-
method discretize(strategy="quantile", bins=5, labels=None, **kwargs)
[source] Bin continuous data into intervals.
+
method discretize(strategy="quantile", bins=5, labels=None, **kwargs)
[source] Bin continuous data into intervals.
For each feature, the bin edges are computed during fit
and, together with the number of bins, they will define the
intervals. Ignores numerical columns.
@@ -4351,7 +4349,7 @@ Data cleaning
distribution and decide on the bins.
-
method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)
[source] Perform encoding of categorical features.
+
method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)
[source] Perform encoding of categorical features.
The encoding type depends on the number of classes in the
column:
@@ -4376,7 +4374,7 @@ Data cleaning
list of the categorical features in the dataset.
-
method impute(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, **kwargs)
[source] Handle missing values in the dataset.
+
method impute(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, **kwargs)
[source] Handle missing values in the dataset.
Impute or remove missing values according to the selected
strategy. Also removes rows and columns with too many missing
values. Use the missing
attribute to customize what are
@@ -4388,7 +4386,7 @@
Data cleaning
missing values per column.
-
method normalize(strategy="yeojohnson", **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
+
method normalize(strategy="yeojohnson", **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
This transformation is useful for modeling issues related
to heteroscedasticity (non-constant variance), or other
situations where normality is desired. Missing values are
@@ -4401,7 +4399,7 @@
Data cleaning
distribution.
-
method prune(strategy="zscore", method="drop", max_sigma=3, include_target=False, **kwargs)
[source] Prune outliers from the training set.
+
method prune(strategy="zscore", method="drop", max_sigma=3, include_target=False, **kwargs)
[source] Prune outliers from the training set.
Replace or remove outliers. The definition of outlier depends
on the selected strategy and can greatly differ from one
another. Ignores categorical columns.
@@ -4418,7 +4416,7 @@ Data cleaning
number of outliers per column.
-
method scale(strategy="standard", include_binary=False, **kwargs)
[source] Scale the data.
+
method scale(strategy="standard", include_binary=False, **kwargs)
[source] Scale the data.
Apply one of sklearn's scalers. Categorical columns are ignored.
See the Scaler class for a description of the parameters.
@@ -4437,7 +4435,7 @@
NLP
textclean | Applies standard text cleaning to the corpus. |
textnormalize | Normalize the corpus. |
tokenize | Tokenize the corpus. |
vectorize | Vectorize the corpus. |
-
method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)
[source] Applies standard text cleaning to the corpus.
+
method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)
[source] Applies standard text cleaning to the corpus.
Transformations include normalizing characters and dropping
noise from the text (emails, HTML tags, URLs, etc...). The
transformations are applied on the column named corpus
, in
@@ -4446,7 +4444,7 @@
NLP
See the TextCleaner class for a description of the
parameters.
-
method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)
[source] Normalize the corpus.
+
method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)
[source] Normalize the corpus.
Convert words to a more uniform standard. The transformations
are applied on the column named corpus
, in the same order the
parameters are presented. If there is no column with that name,
@@ -4455,7 +4453,7 @@
NLP
See the TextNormalizer class for a description of the
parameters.
-
method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)
[source] Tokenize the corpus.
+
method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)
[source] Tokenize the corpus.
Convert documents into sequences of words. Additionally,
create n-grams (represented by words united with underscores,
e.g. "New_York") based on their frequency in the corpus. The
@@ -4463,7 +4461,7 @@
NLP
there is no column with that name, an exception is raised.
See the Tokenizer class for a description of the parameters.
-
method vectorize(strategy="bow", return_sparse=True, **kwargs)
[source] Vectorize the corpus.
+
method vectorize(strategy="bow", return_sparse=True, **kwargs)
[source] Vectorize the corpus.
Transform the corpus into meaningful vectors of numbers. The
transformation is applied on the column named corpus
. If
there is no column with that name, an exception is raised.
@@ -4484,7 +4482,7 @@
Feature engineering
feature_extraction | Extract features from datetime columns. |
feature_generation | Generate new features. |
feature_grouping | Extract statistics from similar features. |
feature_selection | Reduce the number of features in the data. |
-
method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type="ordinal", drop_columns=True, **kwargs)
[source] Extract features from datetime columns.
+
method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type="ordinal", drop_columns=True, **kwargs)
[source] Extract features from datetime columns.
Create new features extracting datetime elements (day, month,
year, etc...) from the provided columns. Columns of dtype
datetime64
are used as is. Categorical columns that can be
@@ -4493,13 +4491,13 @@
Feature engineering
See the FeatureExtractor class for a description of the
parameters.
-
method feature_generation(strategy="dfs", n_features=None, operators=None, **kwargs)
[source] Generate new features.
+
method feature_generation(strategy="dfs", n_features=None, operators=None, **kwargs)
[source] Generate new features.
Create new combinations of existing features to capture the
non-linear relations between the original features.
See the FeatureGenerator class for a description of the
parameters.
-
method feature_grouping(group, operators=None, drop_columns=True, **kwargs)
[source] Extract statistics from similar features.
+
method feature_grouping(group, operators=None, drop_columns=True, **kwargs)
[source] Extract statistics from similar features.
Replace groups of features with related characteristics with new
features that summarize statistical properties of te group. The
statistical operators are calculated over every row of the group.
@@ -4508,7 +4506,7 @@
Feature engineering
See the FeatureGrouper class for a description of the
parameters.
-
method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)
[source] Reduce the number of features in the data.
+
method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)
[source] Reduce the number of features in the data.
Apply feature selection or dimensionality reduction, either to
improve the estimators' accuracy or to boost their performance
on very high-dimensional datasets. Additionally, remove
@@ -4536,7 +4534,7 @@
Training
run | Train and evaluate the models in a direct fashion. |
successive_halving | Fit the models in a successive halving fashion. |
train_sizing | Train and evaluate the models in a train sizing fashion. |
-
method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a direct fashion.
+
method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a direct fashion.
Contrary to successive_halving and
train_sizing, the direct approach only
iterates once over the models, using the full dataset.
@@ -4553,7 +4551,7 @@
Training
See the DirectClassifier or DirectRegressor class for a
description of the parameters.
-
method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Fit the models in a successive halving fashion.
+
method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Fit the models in a successive halving fashion.
The successive halving technique is a bandit-based algorithm
that fits N models to 1/N of the data. The best half are
selected to go to the next iteration where the process is
@@ -4576,7 +4574,7 @@
Training
See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor
class for a description of the parameters.
-
method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a train sizing fashion.
+
method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a train sizing fashion.
When training models, there is usually a trade-off between
model performance and computation time, that is regulated by
the number of samples in the training set. This method can be
diff --git a/docs/API/ATOM/atomforecaster/index.html b/docs/API/ATOM/atomforecaster/index.html
index 18f2db516..3d4089054 100644
--- a/docs/API/ATOM/atomforecaster/index.html
+++ b/docs/API/ATOM/atomforecaster/index.html
@@ -1288,7 +1288,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1330,7 +1330,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1372,7 +1372,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3392,7 +3392,7 @@
ATOMForecaster
-
class atom.api.
ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)
[source] Main class for forecasting tasks.
+
class atom.api.
ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)
[source] Main class for forecasting tasks.
Apply all data transformations and model management provided by
the package on a given dataset. Note that, contrary to sklearn's
API, the instance contains the dataset on which to perform the
@@ -3475,17 +3475,16 @@
ATOMForecaster
follows the
SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu"
to use the GPU. Read more in the
user guide.
-
engine: dict or None, default=None
engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for
data and
estimators. The value should be a
dictionary with keys
data
and/or
estimator
, with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3493,7 +3492,7 @@ ATOMForecaster
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "sklearnex"
- "cuml"
@@ -3586,15 +3585,15 @@ Data attributes
visualize the pipeline, use the plot_pipeline method.
mapping: dict
Encoded values and their respective mapped values.
The column name is the key to its mapping dictionary. Only for
columns mapped to a single column (e.g. Ordinal, Leave-one-out,
-etc...).
dataset: dataframe
Complete data set.
train: dataframe
Training set.
test: dataframe
Test set.
X: dataframe
Feature set.
y: series | dataframe
Target column(s).
X_train: dataframe
Features of the training set.
y_train: series | dataframe
Target column(s) of the training set.
X_test: dataframe
Features of the test set.
y_test: series | dataframe
Target column(s) of the test set.
shape: tuple[int, int]
Shape of the dataset (n_rows, n_columns).
columns: series
Name of all the columns.
n_columns: int
Number of columns.
features: series
Name of the features.
n_features: int
Number of features.
target: str | list[str]
Name of the target column(s).
scaled: bool
Whether the feature set is scaled.
+etc...).
dataset: dataframe
Complete data set.
train: dataframe
Training set.
test: dataframe
Test set.
X: dataframe
Feature set.
y: series | dataframe
Target column(s).
X_train: dataframe
Features of the training set.
y_train: series | dataframe
Target column(s) of the training set.
X_test: dataframe
Features of the test set.
y_test: series | dataframe
Target column(s) of the test set.
shape: tuple[int, int]
Shape of the dataset (n_rows, n_columns).
columns: index
Name of all the columns.
n_columns: int
Number of columns.
features: index
Name of the features.
n_features: int
Number of features.
target: str | list[str]
Name of the target column(s).
scaled: bool
Whether the feature set is scaled.
A data set is considered scaled when it has mean=0 and std=1,
or when there is a scaler in the pipeline. Binary columns (only
-0s and 1s) are excluded from the calculation.
duplicates: series
Number of duplicate rows in the dataset.
missing: list
Values that are considered "missing".
+0s and 1s) are excluded from the calculation.
duplicates: int
Number of duplicate rows in the dataset.
missing: list
Values that are considered "missing".
These values are used by the clean and
impute methods. Default values are: None, NaN,
NaT, +inf, -inf, "", "?", "None", "NA", "nan", "NaN", "NaT",
"inf". Note that None, NaN, +inf and -inf are always considered
-missing since they are incompatible with sklearn estimators.
nans: series | None
Columns with the number of missing values in them.
n_nans: int | None
Number of samples containing missing values.
numerical: series
Names of the numerical features in the dataset.
n_numerical: int
Number of numerical features in the dataset.
categorical: series
Names of the categorical features in the dataset.
n_categorical: int
Number of categorical features in the dataset.
outliers: series | None
Columns in training set with amount of outlier values.
n_outliers: int | None
Number of samples in the training set containing outliers.
+missing since they are incompatible with sklearn estimators.
nans: series | None
Columns with the number of missing values in them.
n_nans: int | None
Number of samples containing missing values.
numerical: index
Names of the numerical features in the dataset.
n_numerical: int
Number of numerical features in the dataset.
categorical: index
Names of the categorical features in the dataset.
n_categorical: int
Number of categorical features in the dataset.
outliers: pd.Series | None
Columns in training set with amount of outlier values.
n_outliers: int | None
Number of samples in the training set containing outliers.
Utility attributes
@@ -3644,7 +3643,7 @@ Plot attributes
The plot attributes are used to customize the plot's aesthetics. Read
more in the user guide.
-Attributes | palette: str | SEQUENCE
Color palette.
+ Attributes | palette: str | sequence
Color palette.
Specify one of plotly's built-in palettes or create
a custom one, e.g. atom.palette = ["red", "green", "blue"] . title_fontsize: int
Fontsize for the plot's title. label_fontsize: int
Fontsize for the labels, legend and hover information. tick_fontsize: int
Fontsize for the ticks along the plot's axes. line_width: int
Width of the line plots. marker_size: int
Size of the markers. |
|
@@ -3656,7 +3655,7 @@ Utility methods
add | Add a transformer to the pipeline. |
apply | Apply a function to the dataset. |
automl | Search for an optimized pipeline in an automated fashion. |
available_models | Give an overview of the available predefined models. |
canvas | Create a figure with multiple plots. |
clear | Reset attributes and clear cache from all models. |
delete | Delete models. |
distribution | Get statistics on column distributions. |
eda | Create an Exploratory Data Analysis report. |
evaluate | Get all models' scores for the provided metrics. |
export_pipeline | Export the pipeline to a sklearn-like object. |
get_class_weight | Return class weights for a balanced data set. |
get_sample_weight | Return sample weights for a balanced data set. |
inverse_transform | Inversely transform new data through the pipeline. |
load | Loads an atom instance from a pickle file. |
log | Print message and save to log file. |
merge | Merge another instance of the same class into this one. |
update_layout | Update the properties of the plot's layout. |
update_traces | Update the properties of the plot's traces. |
reset | Reset the instance to its initial state. |
reset_aesthetics | Reset the plot aesthetics to their default values. |
save | Save the instance to a pickle file. |
save_data | Save the data in the current branch to a .csv file. |
shrink | Converts the columns to the smallest possible matching dtype. |
stacking | Add a Stacking model to the pipeline. |
stats | Display basic information about the dataset. |
status | Get an overview of the branches and models. |
transform | Transform new data through the pipeline. |
voting | Add a Voting model to the pipeline. |
-
method add(transformer, columns=None, train_only=False, **fit_params)
[source] Add a transformer to the pipeline.
+
method add(transformer, columns=None, train_only=False, **fit_params)
[source] Add a transformer to the pipeline.
If the transformer is not fitted, it is fitted on the complete
training set. Afterwards, the data set is transformed and the
estimator is added to atom's pipeline. If the estimator is
@@ -3719,10 +3718,9 @@
Utility methods
-
method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)
[source] Apply a function to the dataset.
-The function should have signature func(dataset, **kw_args) ->
-dataset
. This method is useful for stateless transformations
-such as taking the log, doing custom scaling, etc...
+
method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)
[source] Apply a function to the dataset.
+This method is useful for stateless transformations such as
+taking the log, doing custom scaling, etc...
Note
This approach is preferred over changing the dataset directly
@@ -3735,7 +3733,8 @@
Utility methods
Parameters | func: callable
-Function to apply.
+Function to apply with signature func(dataset, **kw_args) ->
+dataset .
inverse_func: callable or None, default=None
Inverse function of func . If None, the inverse_transform
method returns the input unchanged.
@@ -3746,7 +3745,7 @@ Utility methods
|
-
Search for an optimized pipeline in an automated fashion.
+
Search for an optimized pipeline in an automated fashion.
Automated machine learning (AutoML) automates the selection,
composition and parameterization of machine learning pipelines.
Automating the machine learning often provides faster, more
@@ -3768,7 +3767,7 @@
Utility methods
-
Give an overview of the available predefined models.
+
Give an overview of the available predefined models.
Returns | pd.DataFrame
Information about the available predefined models. Columns
@@ -3790,7 +3789,7 @@ Utility methods
|
-
method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)
[source] Create a figure with multiple plots.
+
method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)
[source] Create a figure with multiple plots.
This @contextmanager
allows you to draw many plots in one
figure. The default option is to add two plots side by side.
See the user guide for an example.
@@ -3835,7 +3834,7 @@ Utility methods
-
Reset attributes and clear cache from all models.
+
Reset attributes and clear cache from all models.
Reset certain model attributes to their initial state, deleting
potentially large data arrays. Use this method to free some
memory before saving the instance. The affected
@@ -3850,7 +3849,7 @@
Utility methods
- Cached holdout data sets
-
Delete models.
+
Delete models.
If all models are removed, the metric is reset. Use this method
to drop unwanted models from the pipeline or to free some memory
before saving. Deleted models are not removed from
@@ -3861,7 +3860,7 @@
Utility methods
-
method distribution(distributions=None, columns=None)
[source] Get statistics on column distributions.
+
method distribution(distributions=None, columns=None)
[source] Get statistics on column distributions.
Compute the Kolmogorov-Smirnov test for various
distributions against columns in the dataset. Only for numerical
columns. Missing values are ignored.
@@ -3891,7 +3890,7 @@ Utility methods
-
method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)
[source] Create an Exploratory Data Analysis report.
+
method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)
[source] Create an Exploratory Data Analysis report.
ATOM uses the ydata-profiling package for the EDA.
The report is rendered directly in the notebook. The created
ProfileReport instance can be accessed through the report
@@ -3915,7 +3914,7 @@
Utility methods
-
method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)
[source] Get all models' scores for the provided metrics.
+
method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)
[source] Get all models' scores for the provided metrics.
Parameters | metric: str, func, scorer, sequence or None, default=None
Metric to calculate. If None, it returns an overview of
@@ -3942,7 +3941,7 @@ Utility methods
|
-
method export_pipeline(model=None, memory=None, verbose=None)
[source] Export the pipeline to a sklearn-like object.
+
method export_pipeline(model=None, memory=None, verbose=None)
[source] Export the pipeline to a sklearn-like object.
Optionally, you can add a model as final estimator. The
returned pipeline is already fitted on the training set.
@@ -3981,7 +3980,7 @@
Utility methods
-
method get_class_weight(dataset="train")
[source] Return class weights for a balanced data set.
+
method get_class_weight(dataset="train")
[source] Return class weights for a balanced data set.
Statistically, the class weights re-balance the data set so
that the sampled data set represents the target population
as closely as possible. The returned weights are inversely
@@ -3996,7 +3995,7 @@
Utility methods
-
method get_sample_weight(dataset="train")
[source] Return sample weights for a balanced data set.
+
method get_sample_weight(dataset="train")
[source] Return sample weights for a balanced data set.
The returned weights are inversely proportional to the class
frequencies in the selected data set. For multioutput tasks,
the weights of each column of y
will be multiplied.
@@ -4009,7 +4008,7 @@ Utility methods
-
method inverse_transform(X=None, y=None, verbose=None)
[source] Inversely transform new data through the pipeline.
+
method inverse_transform(X=None, y=None, verbose=None)
[source] Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. The rest should all implement an inverse_transform
method. If only X
or only y
is provided, it ignores
@@ -4039,7 +4038,7 @@
Utility methods
-
function atom.atom.
load(filename, data=None, transform_data=True, verbose=None)
[source] Loads an atom instance from a pickle file.
+
function atom.atom.
load(filename, data=None, transform_data=True, verbose=None)
[source] Loads an atom instance from a pickle file.
If the instance was saved using save_data=False
,
it's possible to load new data into it and apply all data
transformations.
@@ -4089,7 +4088,7 @@ Utility methods
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -4101,7 +4100,7 @@ Utility methods
|
-
Merge another instance of the same class into this one.
+
Merge another instance of the same class into this one.
Branches, models, metrics and attributes of the other instance
are merged into this one. If there are branches and/or models
with the same name, they are merged adding the suffix
@@ -4119,7 +4118,7 @@
Utility methods
-
Update the properties of the plot's layout.
+
Update the properties of the plot's layout.
Recursively update the structure of the original layout with
the values in the arguments.
@@ -4128,7 +4127,7 @@ Utility methods
-
Update the properties of the plot's traces.
+
Update the properties of the plot's traces.
Recursively update the structure of the original traces with
the values in the arguments.
@@ -4137,13 +4136,13 @@ Utility methods
-
Reset the instance to its initial state.
+
Reset the instance to its initial state.
Deletes all branches and models. The dataset is also reset
to its form after initialization.
-
Reset the plot aesthetics to their default values.
+
Reset the plot aesthetics to their default values.
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -4154,7 +4153,7 @@ Utility methods
|
-
method save_data(filename="auto", dataset="dataset", **kwargs)
[source] Save the data in the current branch to a .csv
file.
+
method save_data(filename="auto", dataset="dataset", **kwargs)
[source] Save the data in the current branch to a .csv
file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -4165,7 +4164,7 @@ Utility methods
|
-
method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)
[source] Converts the columns to the smallest possible matching dtype.
+
method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)
[source] Converts the columns to the smallest possible matching dtype.
Parameters | int2bool: bool, default=False
Whether to convert int columns to bool type. Only if the
@@ -4186,7 +4185,7 @@ Utility methods
|
-
method stacking(models=None, name="Stack", **kwargs)
[source] Add a Stacking model to the pipeline.
+
method stacking(models=None, name="Stack", **kwargs)
[source] Add a Stacking model to the pipeline.
Warning
Combining models trained on different branches into one
@@ -4206,18 +4205,18 @@
Utility methods
-
Display basic information about the dataset.
+
Display basic information about the dataset.
Parameters | _vb: int, default=-2
Internal parameter to always print if called by user.
|
-
Get an overview of the branches and models.
+
Get an overview of the branches and models.
This method prints the same information as the __repr__ and
also saves it to the logger.
-
method transform(X=None, y=None, verbose=None)
[source] Transform new data through the pipeline.
+
method transform(X=None, y=None, verbose=None)
[source] Transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. If only X
or only y
is provided, it ignores
transformers that require the other parameter. This can be
@@ -4247,7 +4246,7 @@
Utility methods
-
method voting(models=None, name="Vote", **kwargs)
[source] Add a Voting model to the pipeline.
+
method voting(models=None, name="Vote", **kwargs)
[source] Add a Voting model to the pipeline.
Warning
Combining models trained on different branches into one
@@ -4280,7 +4279,7 @@
Data cleaning
clean | Applies standard data cleaning steps on the dataset. |
discretize | Bin continuous data into intervals. |
encode | Perform encoding of categorical features. |
impute | Handle missing values in the dataset. |
normalize | Transform the data to follow a Normal/Gaussian distribution. |
prune | Prune outliers from the training set. |
scale | Scale the data. |
-
method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)
[source] Applies standard data cleaning steps on the dataset.
+
method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)
[source] Applies standard data cleaning steps on the dataset.
Use the parameters to choose which transformations to perform.
The available steps are:
@@ -4294,7 +4293,7 @@ Data cleaning
See the Cleaner class for a description of the parameters.
-
method discretize(strategy="quantile", bins=5, labels=None, **kwargs)
[source] Bin continuous data into intervals.
+
method discretize(strategy="quantile", bins=5, labels=None, **kwargs)
[source] Bin continuous data into intervals.
For each feature, the bin edges are computed during fit
and, together with the number of bins, they will define the
intervals. Ignores categorical columns.
@@ -4305,7 +4304,7 @@
Data cleaning
distribution and decide on the bins.
-
method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)
[source] Perform encoding of categorical features.
+
method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)
[source] Perform encoding of categorical features.
The encoding type depends on the number of classes in the
column:
@@ -4330,7 +4329,7 @@ Data cleaning
list of the categorical features in the dataset.
-
method impute(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, **kwargs)
[source] Handle missing values in the dataset.
+
method impute(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, **kwargs)
[source] Handle missing values in the dataset.
Impute or remove missing values according to the selected
strategy. Also removes rows and columns with too many missing
values. Use the missing
attribute to customize what are
@@ -4342,7 +4341,7 @@
Data cleaning
missing values per column.
-
method normalize(strategy="yeojohnson", **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
+
method normalize(strategy="yeojohnson", **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
This transformation is useful for modeling issues related
to heteroscedasticity (non-constant variance), or other
situations where normality is desired. Missing values are
@@ -4355,7 +4354,7 @@
Data cleaning
distribution.
-
method prune(strategy="zscore", method="drop", max_sigma=3, include_target=False, **kwargs)
[source] Prune outliers from the training set.
+
method prune(strategy="zscore", method="drop", max_sigma=3, include_target=False, **kwargs)
[source] Prune outliers from the training set.
Replace or remove outliers. The definition of outlier depends
on the selected strategy and can greatly differ from one
another. Ignores categorical columns.
@@ -4372,7 +4371,7 @@ Data cleaning
number of outliers per column.
-
method scale(strategy="standard", include_binary=False, **kwargs)
[source] Scale the data.
+
method scale(strategy="standard", include_binary=False, **kwargs)
[source] Scale the data.
Apply one of sklearn's scalers. Categorical columns are ignored.
See the Scaler class for a description of the parameters.
@@ -4391,7 +4390,7 @@
NLP
textclean | Applies standard text cleaning to the corpus. |
textnormalize | Normalize the corpus. |
tokenize | Tokenize the corpus. |
vectorize | Vectorize the corpus. |
-
method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)
[source] Applies standard text cleaning to the corpus.
+
method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)
[source] Applies standard text cleaning to the corpus.
Transformations include normalizing characters and dropping
noise from the text (emails, HTML tags, URLs, etc...). The
transformations are applied on the column named corpus
, in
@@ -4400,7 +4399,7 @@
NLP
See the TextCleaner class for a description of the
parameters.
-
method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)
[source] Normalize the corpus.
+
method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)
[source] Normalize the corpus.
Convert words to a more uniform standard. The transformations
are applied on the column named corpus
, in the same order the
parameters are presented. If there is no column with that name,
@@ -4409,7 +4408,7 @@
NLP
See the TextNormalizer class for a description of the
parameters.
-
method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)
[source] Tokenize the corpus.
+
method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)
[source] Tokenize the corpus.
Convert documents into sequences of words. Additionally,
create n-grams (represented by words united with underscores,
e.g. "New_York") based on their frequency in the corpus. The
@@ -4417,7 +4416,7 @@
NLP
there is no column with that name, an exception is raised.
See the Tokenizer class for a description of the parameters.
-
method vectorize(strategy="bow", return_sparse=True, **kwargs)
[source] Vectorize the corpus.
+
method vectorize(strategy="bow", return_sparse=True, **kwargs)
[source] Vectorize the corpus.
Transform the corpus into meaningful vectors of numbers. The
transformation is applied on the column named corpus
. If
there is no column with that name, an exception is raised.
@@ -4438,7 +4437,7 @@
Feature engineering
feature_extraction | Extract features from datetime columns. |
feature_generation | Generate new features. |
feature_grouping | Extract statistics from similar features. |
feature_selection | Reduce the number of features in the data. |
-
method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type="ordinal", drop_columns=True, **kwargs)
[source] Extract features from datetime columns.
+
method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type="ordinal", drop_columns=True, **kwargs)
[source] Extract features from datetime columns.
Create new features extracting datetime elements (day, month,
year, etc...) from the provided columns. Columns of dtype
datetime64
are used as is. Categorical columns that can be
@@ -4447,13 +4446,13 @@
Feature engineering
See the FeatureExtractor class for a description of the
parameters.
-
method feature_generation(strategy="dfs", n_features=None, operators=None, **kwargs)
[source] Generate new features.
+
method feature_generation(strategy="dfs", n_features=None, operators=None, **kwargs)
[source] Generate new features.
Create new combinations of existing features to capture the
non-linear relations between the original features.
See the FeatureGenerator class for a description of the
parameters.
-
method feature_grouping(group, operators=None, drop_columns=True, **kwargs)
[source] Extract statistics from similar features.
+
method feature_grouping(group, operators=None, drop_columns=True, **kwargs)
[source] Extract statistics from similar features.
Replace groups of features with related characteristics with new
features that summarize statistical properties of the group. The
statistical operators are calculated over every row of the group.
@@ -4462,7 +4461,7 @@
Feature engineering
See the FeatureGrouper class for a description of the
parameters.
-
method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)
[source] Reduce the number of features in the data.
+
method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)
[source] Reduce the number of features in the data.
Apply feature selection or dimensionality reduction, either to
improve the estimators' accuracy or to boost their performance
on very high-dimensional datasets. Additionally, remove
@@ -4490,7 +4489,7 @@
Training
run | Train and evaluate the models in a direct fashion. |
successive_halving | Fit the models in a successive halving fashion. |
train_sizing | Train and evaluate the models in a train sizing fashion. |
-
method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a direct fashion.
+
method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a direct fashion.
Contrary to successive_halving and
train_sizing, the direct approach only
iterates once over the models, using the full dataset.
@@ -4507,7 +4506,7 @@
Training
See the DirectClassifier or DirectRegressor class for a
description of the parameters.
-
method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Fit the models in a successive halving fashion.
+
method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Fit the models in a successive halving fashion.
The successive halving technique is a bandit-based algorithm
that fits N models to 1/N of the data. The best half are
selected to go to the next iteration where the process is
@@ -4530,7 +4529,7 @@
Training
See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor
class for a description of the parameters.
-
method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a train sizing fashion.
+
method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a train sizing fashion.
When training models, there is usually a trade-off between
model performance and computation time, that is regulated by
the number of samples in the training set. This method can be
diff --git a/docs/API/ATOM/atommodel/index.html b/docs/API/ATOM/atommodel/index.html
index ddd7a11d9..5e9cb891d 100644
--- a/docs/API/ATOM/atommodel/index.html
+++ b/docs/API/ATOM/atommodel/index.html
@@ -1205,7 +1205,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1247,7 +1247,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1289,7 +1289,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3226,7 +3226,7 @@
ATOMModel
-
function atom.api.
ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)
[source] Convert an estimator to a model that can be ingested by atom.
+
function atom.api.
ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)
[source] Convert an estimator to a model that can be ingested by atom.
This function adds the relevant attributes to the estimator so
that they can be used by atom. Note that only estimators that follow
sklearn's API are compatible.
diff --git a/docs/API/ATOM/atomregressor/index.html b/docs/API/ATOM/atomregressor/index.html
index f6014ecf6..2abe31e14 100644
--- a/docs/API/ATOM/atomregressor/index.html
+++ b/docs/API/ATOM/atomregressor/index.html
@@ -1288,7 +1288,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1330,7 +1330,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1372,7 +1372,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3392,7 +3392,7 @@
ATOMRegressor
-
class atom.api.
ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)
[source] Main class for regression tasks.
+
class atom.api.
ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)
[source] Main class for regression tasks.
Apply all data transformations and model management provided by
the package on a given dataset. Note that, contrary to sklearn's
API, the instance contains the dataset on which to perform the
@@ -3484,17 +3484,16 @@
ATOMRegressor
follows the
SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu"
to use the GPU. Read more in the
user guide.
-
engine: dict or None, default=None
engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for
data and
estimators. The value should be a
dictionary with keys
data
and/or
estimator
, with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3502,7 +3501,7 @@ ATOMRegressor
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "sklearnex"
- "cuml"
@@ -3599,15 +3598,15 @@ Data attributes
visualize the pipeline, use the plot_pipeline method.
mapping: dict
Encoded values and their respective mapped values.
The column name is the key to its mapping dictionary. Only for
columns mapped to a single column (e.g. Ordinal, Leave-one-out,
-etc...).
dataset: dataframe
Complete data set.
train: dataframe
Training set.
test: dataframe
Test set.
X: dataframe
Feature set.
y: series | dataframe
Target column(s).
X_train: dataframe
Features of the training set.
y_train: series | dataframe
Target column(s) of the training set.
X_test: dataframe
Features of the test set.
y_test: series | dataframe
Target column(s) of the test set.
shape: tuple[int, int]
Shape of the dataset (n_rows, n_columns).
columns: series
Name of all the columns.
n_columns: int
Number of columns.
features: series
Name of the features.
n_features: int
Number of features.
target: str | list[str]
Name of the target column(s).
scaled: bool
Whether the feature set is scaled.
+etc...).
dataset: dataframe
Complete data set.
train: dataframe
Training set.
test: dataframe
Test set.
X: dataframe
Feature set.
y: series | dataframe
Target column(s).
X_train: dataframe
Features of the training set.
y_train: series | dataframe
Target column(s) of the training set.
X_test: dataframe
Features of the test set.
y_test: series | dataframe
Target column(s) of the test set.
shape: tuple[int, int]
Shape of the dataset (n_rows, n_columns).
columns: index
Name of all the columns.
n_columns: int
Number of columns.
features: index
Name of the features.
n_features: int
Number of features.
target: str | list[str]
Name of the target column(s).
scaled: bool
Whether the feature set is scaled.
A data set is considered scaled when it has mean=0 and std=1,
or when there is a scaler in the pipeline. Binary columns (only
-0s and 1s) are excluded from the calculation.
duplicates: series
Number of duplicate rows in the dataset.
missing: list
Values that are considered "missing".
+0s and 1s) are excluded from the calculation.
duplicates: int
Number of duplicate rows in the dataset.
missing: list
Values that are considered "missing".
These values are used by the clean and
impute methods. Default values are: None, NaN,
NaT, +inf, -inf, "", "?", "None", "NA", "nan", "NaN", "NaT",
"inf". Note that None, NaN, +inf and -inf are always considered
-missing since they are incompatible with sklearn estimators.
nans: series | None
Columns with the number of missing values in them.
n_nans: int | None
Number of samples containing missing values.
numerical: series
Names of the numerical features in the dataset.
n_numerical: int
Number of numerical features in the dataset.
categorical: series
Names of the categorical features in the dataset.
n_categorical: int
Number of categorical features in the dataset.
outliers: series | None
Columns in training set with amount of outlier values.
n_outliers: int | None
Number of samples in the training set containing outliers.
+missing since they are incompatible with sklearn estimators.
nans: series | None
Columns with the number of missing values in them.
n_nans: int | None
Number of samples containing missing values.
numerical: index
Names of the numerical features in the dataset.
n_numerical: int
Number of numerical features in the dataset.
categorical: index
Names of the categorical features in the dataset.
n_categorical: int
Number of categorical features in the dataset.
outliers: pd.Series | None
Columns in training set with amount of outlier values.
n_outliers: int | None
Number of samples in the training set containing outliers.
Utility attributes
@@ -3657,7 +3656,7 @@ Plot attributes
The plot attributes are used to customize the plot's aesthetics. Read
more in the user guide.
-Attributes | palette: str | SEQUENCE
Color palette.
+ Attributes | palette: str | sequence
Color palette.
Specify one of plotly's built-in palettes or create
a custom one, e.g. atom.palette = ["red", "green", "blue"] . title_fontsize: int
Fontsize for the plot's title. label_fontsize: int
Fontsize for the labels, legend and hover information. tick_fontsize: int
Fontsize for the ticks along the plot's axes. line_width: int
Width of the line plots. marker_size: int
Size of the markers. |
|
@@ -3669,7 +3668,7 @@ Utility methods
add | Add a transformer to the pipeline. |
apply | Apply a function to the dataset. |
automl | Search for an optimized pipeline in an automated fashion. |
available_models | Give an overview of the available predefined models. |
canvas | Create a figure with multiple plots. |
clear | Reset attributes and clear cache from all models. |
delete | Delete models. |
distribution | Get statistics on column distributions. |
eda | Create an Exploratory Data Analysis report. |
evaluate | Get all models' scores for the provided metrics. |
export_pipeline | Export the pipeline to a sklearn-like object. |
get_class_weight | Return class weights for a balanced data set. |
get_sample_weight | Return sample weights for a balanced data set. |
inverse_transform | Inversely transform new data through the pipeline. |
load | Loads an atom instance from a pickle file. |
log | Print message and save to log file. |
merge | Merge another instance of the same class into this one. |
update_layout | Update the properties of the plot's layout. |
update_traces | Update the properties of the plot's traces. |
reset | Reset the instance to its initial state. |
reset_aesthetics | Reset the plot aesthetics to their default values. |
save | Save the instance to a pickle file. |
save_data | Save the data in the current branch to a .csv file. |
shrink | Converts the columns to the smallest possible matching dtype. |
stacking | Add a Stacking model to the pipeline. |
stats | Display basic information about the dataset. |
status | Get an overview of the branches and models. |
transform | Transform new data through the pipeline. |
voting | Add a Voting model to the pipeline. |
-
method add(transformer, columns=None, train_only=False, **fit_params)
[source] Add a transformer to the pipeline.
+
method add(transformer, columns=None, train_only=False, **fit_params)
[source] Add a transformer to the pipeline.
If the transformer is not fitted, it is fitted on the complete
training set. Afterwards, the data set is transformed and the
estimator is added to atom's pipeline. If the estimator is
@@ -3732,10 +3731,9 @@
Utility methods
-
method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)
[source] Apply a function to the dataset.
-The function should have signature func(dataset, **kw_args) ->
-dataset
. This method is useful for stateless transformations
-such as taking the log, doing custom scaling, etc...
+
method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)
[source] Apply a function to the dataset.
+This method is useful for stateless transformations such as
+taking the log, doing custom scaling, etc...
Note
This approach is preferred over changing the dataset directly
@@ -3748,7 +3746,8 @@
Utility methods
Parameters | func: callable
-Function to apply.
+Function to apply with signature func(dataset, **kw_args) ->
+dataset .
inverse_func: callable or None, default=None
Inverse function of func . If None, the inverse_transform
method returns the input unchanged.
@@ -3759,7 +3758,7 @@ Utility methods
|
-
Search for an optimized pipeline in an automated fashion.
+
Search for an optimized pipeline in an automated fashion.
Automated machine learning (AutoML) automates the selection,
composition and parameterization of machine learning pipelines.
Automating the machine learning often provides faster, more
@@ -3781,7 +3780,7 @@
Utility methods
-
Give an overview of the available predefined models.
+
Give an overview of the available predefined models.
Returns | pd.DataFrame
Information about the available predefined models. Columns
@@ -3803,7 +3802,7 @@ Utility methods
|
-
method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)
[source] Create a figure with multiple plots.
+
method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)
[source] Create a figure with multiple plots.
This @contextmanager
allows you to draw many plots in one
figure. The default option is to add two plots side by side.
See the user guide for an example.
@@ -3848,7 +3847,7 @@ Utility methods
-
Reset attributes and clear cache from all models.
+
Reset attributes and clear cache from all models.
Reset certain model attributes to their initial state, deleting
potentially large data arrays. Use this method to free some
memory before saving the instance. The affected
@@ -3863,7 +3862,7 @@
Utility methods
- Cached holdout data sets
-
Delete models.
+
Delete models.
If all models are removed, the metric is reset. Use this method
to drop unwanted models from the pipeline or to free some memory
before saving. Deleted models are not removed from
@@ -3874,7 +3873,7 @@
Utility methods
-
method distribution(distributions=None, columns=None)
[source] Get statistics on column distributions.
+
method distribution(distributions=None, columns=None)
[source] Get statistics on column distributions.
Compute the Kolmogorov-Smirnov test for various
distributions against columns in the dataset. Only for numerical
columns. Missing values are ignored.
@@ -3904,7 +3903,7 @@ Utility methods
-
method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)
[source] Create an Exploratory Data Analysis report.
+
method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)
[source] Create an Exploratory Data Analysis report.
ATOM uses the ydata-profiling package for the EDA.
The report is rendered directly in the notebook. The created
ProfileReport instance can be accessed through the report
@@ -3928,7 +3927,7 @@
Utility methods
-
method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)
[source] Get all models' scores for the provided metrics.
+
method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)
[source] Get all models' scores for the provided metrics.
Parameters | metric: str, func, scorer, sequence or None, default=None
Metric to calculate. If None, it returns an overview of
@@ -3955,7 +3954,7 @@ Utility methods
|
-
method export_pipeline(model=None, memory=None, verbose=None)
[source] Export the pipeline to a sklearn-like object.
+
method export_pipeline(model=None, memory=None, verbose=None)
[source] Export the pipeline to a sklearn-like object.
Optionally, you can add a model as final estimator. The
returned pipeline is already fitted on the training set.
@@ -3994,7 +3993,7 @@
Utility methods
-
method get_class_weight(dataset="train")
[source] Return class weights for a balanced data set.
+
method get_class_weight(dataset="train")
[source] Return class weights for a balanced data set.
Statistically, the class weights re-balance the data set so
that the sampled data set represents the target population
as closely as possible. The returned weights are inversely
@@ -4009,7 +4008,7 @@
Utility methods
-
method get_sample_weight(dataset="train")
[source] Return sample weights for a balanced data set.
+
method get_sample_weight(dataset="train")
[source] Return sample weights for a balanced data set.
The returned weights are inversely proportional to the class
frequencies in the selected data set. For multioutput tasks,
the weights of each column of y
will be multiplied.
@@ -4022,7 +4021,7 @@ Utility methods
-
method inverse_transform(X=None, y=None, verbose=None)
[source] Inversely transform new data through the pipeline.
+
method inverse_transform(X=None, y=None, verbose=None)
[source] Inversely transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. The rest should all implement a inverse_transform
method. If only X
or only y
is provided, it ignores
@@ -4052,7 +4051,7 @@
Utility methods
-
function atom.atom.
load(filename, data=None, transform_data=True, verbose=None)
[source] Loads an atom instance from a pickle file.
+
function atom.atom.
load(filename, data=None, transform_data=True, verbose=None)
[source] Loads an atom instance from a pickle file.
If the instance was saved using save_data=False
,
it's possible to load new data into it and apply all data
transformations.
@@ -4102,7 +4101,7 @@ Utility methods
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -4114,7 +4113,7 @@ Utility methods
|
-
Merge another instance of the same class into this one.
+
Merge another instance of the same class into this one.
Branches, models, metrics and attributes of the other instance
are merged into this one. If there are branches and/or models
with the same name, they are merged adding the suffix
@@ -4132,7 +4131,7 @@
Utility methods
-
Update the properties of the plot's layout.
+
Update the properties of the plot's layout.
Recursively update the structure of the original layout with
the values in the arguments.
@@ -4141,7 +4140,7 @@ Utility methods
-
Update the properties of the plot's traces.
+
Update the properties of the plot's traces.
Recursively update the structure of the original traces with
the values in the arguments.
@@ -4150,13 +4149,13 @@ Utility methods
-
Reset the instance to its initial state.
+
Reset the instance to its initial state.
Deletes all branches and models. The dataset is also reset
to its form after initialization.
-
Reset the plot aesthetics to their default values.
+
Reset the plot aesthetics to their default values.
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -4167,7 +4166,7 @@ Utility methods
|
-
method save_data(filename="auto", dataset="dataset", **kwargs)
[source] Save the data in the current branch to a .csv
file.
+
method save_data(filename="auto", dataset="dataset", **kwargs)
[source] Save the data in the current branch to a .csv
file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -4178,7 +4177,7 @@ Utility methods
|
-
method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)
[source] Converts the columns to the smallest possible matching dtype.
+
method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)
[source] Converts the columns to the smallest possible matching dtype.
Parameters | int2bool: bool, default=False
Whether to convert int columns to bool type. Only if the
@@ -4199,7 +4198,7 @@ Utility methods
|
-
method stacking(models=None, name="Stack", **kwargs)
[source] Add a Stacking model to the pipeline.
+
method stacking(models=None, name="Stack", **kwargs)
[source] Add a Stacking model to the pipeline.
Warning
Combining models trained on different branches into one
@@ -4219,18 +4218,18 @@
Utility methods
-
Display basic information about the dataset.
+
Display basic information about the dataset.
Parameters | _vb: int, default=-2
Internal parameter to always print if called by user.
|
-
Get an overview of the branches and models.
+
Get an overview of the branches and models.
This method prints the same information as the __repr__ and
also saves it to the logger.
-
method transform(X=None, y=None, verbose=None)
[source] Transform new data through the pipeline.
+
method transform(X=None, y=None, verbose=None)
[source] Transform new data through the pipeline.
Transformers that are only applied on the training set are
skipped. If only X
or only y
is provided, it ignores
transformers that require the other parameter. This can be
@@ -4260,7 +4259,7 @@
Utility methods
-
method voting(models=None, name="Vote", **kwargs)
[source] Add a Voting model to the pipeline.
+
method voting(models=None, name="Vote", **kwargs)
[source] Add a Voting model to the pipeline.
Warning
Combining models trained on different branches into one
@@ -4293,7 +4292,7 @@
Data cleaning
clean | Applies standard data cleaning steps on the dataset. |
discretize | Bin continuous data into intervals. |
encode | Perform encoding of categorical features. |
impute | Handle missing values in the dataset. |
normalize | Transform the data to follow a Normal/Gaussian distribution. |
prune | Prune outliers from the training set. |
scale | Scale the data. |
-
method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)
[source] Applies standard data cleaning steps on the dataset.
+
method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)
[source] Applies standard data cleaning steps on the dataset.
Use the parameters to choose which transformations to perform.
The available steps are:
@@ -4307,7 +4306,7 @@ Data cleaning
See the Cleaner class for a description of the parameters.
-
method discretize(strategy="quantile", bins=5, labels=None, **kwargs)
[source] Bin continuous data into intervals.
+
method discretize(strategy="quantile", bins=5, labels=None, **kwargs)
[source] Bin continuous data into intervals.
For each feature, the bin edges are computed during fit
and, together with the number of bins, they will define the
intervals. Ignores categorical columns.
@@ -4318,7 +4317,7 @@
Data cleaning
distribution and decide on the bins.
-
method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)
[source] Perform encoding of categorical features.
+
method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)
[source] Perform encoding of categorical features.
The encoding type depends on the number of classes in the
column:
@@ -4343,7 +4342,7 @@ Data cleaning
list of the categorical features in the dataset.
-
method impute(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, **kwargs)
[source] Handle missing values in the dataset.
+
method impute(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, **kwargs)
[source] Handle missing values in the dataset.
Impute or remove missing values according to the selected
strategy. Also removes rows and columns with too many missing
values. Use the missing
attribute to customize what are
@@ -4355,7 +4354,7 @@
Data cleaning
missing values per column.
-
method normalize(strategy="yeojohnson", **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
+
method normalize(strategy="yeojohnson", **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
This transformation is useful for modeling issues related
to heteroscedasticity (non-constant variance), or other
situations where normality is desired. Missing values are
@@ -4368,7 +4367,7 @@
Data cleaning
distribution.
-
method prune(strategy="zscore", method="drop", max_sigma=3, include_target=False, **kwargs)
[source] Prune outliers from the training set.
+
method prune(strategy="zscore", method="drop", max_sigma=3, include_target=False, **kwargs)
[source] Prune outliers from the training set.
Replace or remove outliers. The definition of outlier depends
on the selected strategy and can greatly differ from one
another. Ignores categorical columns.
@@ -4385,7 +4384,7 @@ Data cleaning
number of outliers per column.
-
method scale(strategy="standard", include_binary=False, **kwargs)
[source] Scale the data.
+
method scale(strategy="standard", include_binary=False, **kwargs)
[source] Scale the data.
Apply one of sklearn's scalers. Categorical columns are ignored.
See the Scaler class for a description of the parameters.
@@ -4404,7 +4403,7 @@
NLP
textclean | Applies standard text cleaning to the corpus. |
textnormalize | Normalize the corpus. |
tokenize | Tokenize the corpus. |
vectorize | Vectorize the corpus. |
-
method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)
[source] Applies standard text cleaning to the corpus.
+
method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)
[source] Applies standard text cleaning to the corpus.
Transformations include normalizing characters and dropping
noise from the text (emails, HTML tags, URLs, etc...). The
transformations are applied on the column named corpus
, in
@@ -4413,7 +4412,7 @@
NLP
See the TextCleaner class for a description of the
parameters.
-
method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)
[source] Normalize the corpus.
+
method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)
[source] Normalize the corpus.
Convert words to a more uniform standard. The transformations
are applied on the column named corpus
, in the same order the
parameters are presented. If there is no column with that name,
@@ -4422,7 +4421,7 @@
NLP
See the TextNormalizer class for a description of the
parameters.
-
method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)
[source] Tokenize the corpus.
+
method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)
[source] Tokenize the corpus.
Convert documents into sequences of words. Additionally,
create n-grams (represented by words united with underscores,
e.g. "New_York") based on their frequency in the corpus. The
@@ -4430,7 +4429,7 @@
NLP
there is no column with that name, an exception is raised.
See the Tokenizer class for a description of the parameters.
-
method vectorize(strategy="bow", return_sparse=True, **kwargs)
[source] Vectorize the corpus.
+
method vectorize(strategy="bow", return_sparse=True, **kwargs)
[source] Vectorize the corpus.
Transform the corpus into meaningful vectors of numbers. The
transformation is applied on the column named corpus
. If
there is no column with that name, an exception is raised.
@@ -4451,7 +4450,7 @@
Feature engineering
feature_extraction | Extract features from datetime columns. |
feature_generation | Generate new features. |
feature_grouping | Extract statistics from similar features. |
feature_selection | Reduce the number of features in the data. |
-
method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type="ordinal", drop_columns=True, **kwargs)
[source] Extract features from datetime columns.
+
method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type="ordinal", drop_columns=True, **kwargs)
[source] Extract features from datetime columns.
Create new features extracting datetime elements (day, month,
year, etc...) from the provided columns. Columns of dtype
datetime64
are used as is. Categorical columns that can be
@@ -4460,13 +4459,13 @@
Feature engineering
See the FeatureExtractor class for a description of the
parameters.
-
method feature_generation(strategy="dfs", n_features=None, operators=None, **kwargs)
[source] Generate new features.
+
method feature_generation(strategy="dfs", n_features=None, operators=None, **kwargs)
[source] Generate new features.
Create new combinations of existing features to capture the
non-linear relations between the original features.
See the FeatureGenerator class for a description of the
parameters.
-
method feature_grouping(group, operators=None, drop_columns=True, **kwargs)
[source] Extract statistics from similar features.
+
method feature_grouping(group, operators=None, drop_columns=True, **kwargs)
[source] Extract statistics from similar features.
Replace groups of features with related characteristics with new
features that summarize statistical properties of the group. The
statistical operators are calculated over every row of the group.
@@ -4475,7 +4474,7 @@
Feature engineering
See the FeatureGrouper class for a description of the
parameters.
-
method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)
[source] Reduce the number of features in the data.
+
method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)
[source] Reduce the number of features in the data.
Apply feature selection or dimensionality reduction, either to
improve the estimators' accuracy or to boost their performance
on very high-dimensional datasets. Additionally, remove
@@ -4503,7 +4502,7 @@
Training
run | Train and evaluate the models in a direct fashion. |
successive_halving | Fit the models in a successive halving fashion. |
train_sizing | Train and evaluate the models in a train sizing fashion. |
-
method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a direct fashion.
+
method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a direct fashion.
Contrary to successive_halving and
train_sizing, the direct approach only
iterates once over the models, using the full dataset.
@@ -4520,7 +4519,7 @@
Training
See the DirectClassifier or DirectRegressor class for a
description of the parameters.
-
method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Fit the models in a successive halving fashion.
+
method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Fit the models in a successive halving fashion.
The successive halving technique is a bandit-based algorithm
that fits N models to 1/N of the data. The best half are
selected to go to the next iteration where the process is
@@ -4543,7 +4542,7 @@
Training
See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor
class for a description of the parameters.
-
method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a train sizing fashion.
+
method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", **kwargs)
[source] Train and evaluate the models in a train sizing fashion.
When training models, there is usually a trade-off between
model performance and computation time, that is regulated by
the number of samples in the training set. This method can be
diff --git a/docs/API/data_cleaning/balancer/index.html b/docs/API/data_cleaning/balancer/index.html
index 88faab6eb..aee8132eb 100644
--- a/docs/API/data_cleaning/balancer/index.html
+++ b/docs/API/data_cleaning/balancer/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Balancer
-
class atom.data_cleaning.
Balancer(strategy="ADASYN", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)
[source] Balance the number of samples per class in the target column.
+
class atom.data_cleaning.
Balancer(strategy="ADASYN", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)
[source] Balance the number of samples per class in the target column.
When oversampling, the newly created samples have an increasing
integer index for numerical indices, and an index of the form
[estimator]_N for non-numerical indices, where N stands for the
@@ -3339,7 +3339,7 @@
Methods
fit | Does nothing. |
fit_transform | Fit to data, then transform it. |
get_metadata_routing | Get metadata routing of this object. |
get_params | Get parameters for this estimator. |
inverse_transform | Does nothing. |
log | Print message and save to log file. |
save | Save the instance to a pickle file. |
set_params | Set the parameters of this estimator. |
transform | Balance the data. |
-
method fit(X=None, y=None, **fit_params)
[source] Does nothing.
+
method fit(X=None, y=None, **fit_params)
[source] Does nothing.
Implemented for continuity of the API.
Parameters | X: dataframe-like or None, default=None
@@ -3364,7 +3364,7 @@ Methods
|
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3408,7 +3408,7 @@ Methods
|
-
method inverse_transform(X=None, y=None)
[source] Does nothing.
+
method inverse_transform(X=None, y=None)
[source] Does nothing.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3432,7 +3432,7 @@ Methods
|
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3444,7 +3444,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3464,7 +3464,7 @@ Methods
|
-
Balance the data.
+
Balance the data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
diff --git a/docs/API/data_cleaning/cleaner/index.html b/docs/API/data_cleaning/cleaner/index.html
index aa370330c..b3bcbcb13 100644
--- a/docs/API/data_cleaning/cleaner/index.html
+++ b/docs/API/data_cleaning/cleaner/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Cleaner
-
class atom.data_cleaning. Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device="cpu", engine=None, verbose=0, logger=None) [source] Applies standard data cleaning steps on a dataset.
+
class atom.data_cleaning. Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None) [source] Applies standard data cleaning steps on a dataset.
Use the parameters to choose which transformations to perform.
The available steps are:
@@ -3280,17 +3280,16 @@ Cleaner
follows the SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu" to use the GPU. Read more in the
user guide.
- engine: dict or None, default=None engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for data and
estimators. The value should be a
dictionary with keys data and/or estimator , with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3298,7 +3297,7 @@ Cleaner
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "cuml"
@@ -3378,7 +3377,7 @@ Methods
fit | Fit to data. | fit_transform | Fit to data, then transform it. | get_metadata_routing | Get metadata routing of this object. | get_params | Get parameters for this estimator. | inverse_transform | Inversely transform the label encoding. | log | Print message and save to log file. | save | Save the instance to a pickle file. | set_params | Set the parameters of this estimator. | transform | Apply the data cleaning steps to the data. |
|
-
Fit to data.
+
Fit to data.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3399,7 +3398,7 @@ Methods
|
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3443,7 +3442,7 @@ Methods
|
-
method inverse_transform(X=None, y=None)
[source] Inversely transform the label encoding.
+
method inverse_transform(X=None, y=None)
[source] Inversely transform the label encoding.
This method only inversely transforms the target encoding.
The rest of the transformations can't be inverted. If
encode_target=False
, the data is returned as is.
@@ -3468,7 +3467,7 @@
Methods
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3480,7 +3479,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3500,7 +3499,7 @@ Methods
|
-
method transform(X=None, y=None)
[source] Apply the data cleaning steps to the data.
+
method transform(X=None, y=None)
[source] Apply the data cleaning steps to the data.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
diff --git a/docs/API/data_cleaning/discretizer/index.html b/docs/API/data_cleaning/discretizer/index.html
index 2e86a1f49..ab1d8616c 100644
--- a/docs/API/data_cleaning/discretizer/index.html
+++ b/docs/API/data_cleaning/discretizer/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Discretizer
-
class atom.data_cleaning. Discretizer(strategy="quantile", bins=5, labels=None, device="cpu", engine=None, verbose=0, logger=None, random_state=None) [source] Bin continuous data into intervals.
+
class atom.data_cleaning. Discretizer(strategy="quantile", bins=5, labels=None, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None) [source] Bin continuous data into intervals.
For each feature, the bin edges are computed during fit and,
together with the number of bins, they define the intervals.
Ignores categorical columns.
@@ -3290,17 +3290,16 @@ Discretizer
follows the SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu" to use the GPU. Read more in the
user guide.
- engine: dict or None, default=None engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for data and
estimators. The value should be a
dictionary with keys data and/or estimator , with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3308,7 +3307,7 @@ Discretizer
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "cuml"
@@ -3393,7 +3392,7 @@ Methods
fit | Fit to data. | fit_transform | Fit to data, then transform it. | get_metadata_routing | Get metadata routing of this object. | get_params | Get parameters for this estimator. | inverse_transform | Does nothing. | log | Print message and save to log file. | save | Save the instance to a pickle file. | set_params | Set the parameters of this estimator. | transform | Bin the data into intervals. |
|
-
Fit to data.
+
Fit to data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
@@ -3404,7 +3403,7 @@ Methods
|
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3448,7 +3447,7 @@ Methods
|
-
method inverse_transform(X=None, y=None)
[source] Does nothing.
+
method inverse_transform(X=None, y=None)
[source] Does nothing.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3472,7 +3471,7 @@ Methods
|
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3484,7 +3483,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3504,7 +3503,7 @@ Methods
|
-
Bin the data into intervals.
+
Bin the data into intervals.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
diff --git a/docs/API/data_cleaning/encoder/index.html b/docs/API/data_cleaning/encoder/index.html
index 362ddcf70..4de839743 100644
--- a/docs/API/data_cleaning/encoder/index.html
+++ b/docs/API/data_cleaning/encoder/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Encoder
-
class atom.data_cleaning. Encoder(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="infrequent", verbose=0, logger=None, **kwargs) [source] Perform encoding of categorical features.
+
class atom.data_cleaning. Encoder(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="infrequent", verbose=0, logger=None, **kwargs) [source] Perform encoding of categorical features.
The encoding type depends on the number of classes in the column:
|
-
Fit to data.
+
Fit to data.
Note that leaving y=None can lead to errors if the strategy
encoder requires target values. For multioutput tasks, only
the first target column is used to fit the encoder.
@@ -3390,7 +3390,7 @@ Methods
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3434,7 +3434,7 @@ Methods
|
-
method inverse_transform(X=None, y=None)
[source] Does nothing.
+
method inverse_transform(X=None, y=None)
[source] Does nothing.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3458,7 +3458,7 @@ Methods
|
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3470,7 +3470,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3490,7 +3490,7 @@ Methods
|
-
Encode the data.
+
Encode the data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
diff --git a/docs/API/data_cleaning/imputer/index.html b/docs/API/data_cleaning/imputer/index.html
index 11e747b75..bf6ebe64b 100644
--- a/docs/API/data_cleaning/imputer/index.html
+++ b/docs/API/data_cleaning/imputer/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Imputer
-
class atom.data_cleaning. Imputer(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, device="cpu", engine=None, verbose=0, logger=None) [source] Handle missing values in the data.
+
class atom.data_cleaning. Imputer(strat_num="drop", strat_cat="drop", max_nan_rows=None, max_nan_cols=None, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None) [source] Handle missing values in the data.
Impute or remove missing values according to the selected strategy.
Also removes rows and columns with too many missing values. Use
the missing attribute to customize what are considered "missing
@@ -3275,17 +3275,16 @@ Imputer
follows the SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu" to use the GPU. Read more in the
user guide.
- engine: dict or None, default=None engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for data and
estimators. The value should be a
dictionary with keys data and/or estimator , with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3293,7 +3292,7 @@ Imputer
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "cuml"
@@ -3379,7 +3378,7 @@ Methods
fit | Fit to data. | fit_transform | Fit to data, then transform it. | get_metadata_routing | Get metadata routing of this object. | get_params | Get parameters for this estimator. | inverse_transform | Does nothing. | log | Print message and save to log file. | save | Save the instance to a pickle file. | set_params | Set the parameters of this estimator. | transform | Impute the missing values. |
|
-
Fit to data.
+
Fit to data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
@@ -3390,7 +3389,7 @@ Methods
|
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3434,7 +3433,7 @@ Methods
|
-
method inverse_transform(X=None, y=None)
[source] Does nothing.
+
method inverse_transform(X=None, y=None)
[source] Does nothing.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3458,7 +3457,7 @@ Methods
|
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3470,7 +3469,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3490,7 +3489,7 @@ Methods
|
-
Impute the missing values.
+
Impute the missing values.
Note that leaving y=None can lead to inconsistencies in
data length between X and y if rows are dropped during
the transformation.
diff --git a/docs/API/data_cleaning/normalizer/index.html b/docs/API/data_cleaning/normalizer/index.html
index ba0a4313d..880c285ea 100644
--- a/docs/API/data_cleaning/normalizer/index.html
+++ b/docs/API/data_cleaning/normalizer/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Normalizer
-
class atom.data_cleaning.
Normalizer(strategy="yeojohnson", device="cpu", engine=None, verbose=0, logger=None, random_state=None, **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
+
class atom.data_cleaning.
Normalizer(strategy="yeojohnson", device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None, **kwargs)
[source] Transform the data to follow a Normal/Gaussian distribution.
This transformation is useful for modeling issues related to
heteroscedasticity (non-constant variance), or other situations
where normality is desired. Missing values are disregarded in
@@ -3271,17 +3271,16 @@
Normalizer
follows the SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu"
to use the GPU. Read more in the
user guide.
-engine: dict or None, default=None
engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for
data and
estimators. The value should be a
dictionary with keys
data
and/or
estimator
, with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3289,7 +3288,7 @@ Normalizer
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "cuml"
@@ -3368,7 +3367,7 @@ Methods
fit | Fit to data. |
fit_transform | Fit to data, then transform it. |
get_metadata_routing | Get metadata routing of this object. |
get_params | Get parameters for this estimator. |
inverse_transform | Apply the inverse transformation to the data. |
log | Print message and save to log file. |
save | Save the instance to a pickle file. |
set_params | Set the parameters of this estimator. |
transform | Apply the transformations to the data. |
-
Fit to data.
+
Fit to data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
@@ -3379,7 +3378,7 @@ Methods
|
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3423,7 +3422,7 @@ Methods
|
-
method inverse_transform(X, y=None)
[source] Apply the inverse transformation to the data.
+
method inverse_transform(X, y=None)
[source] Apply the inverse transformation to the data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
@@ -3434,7 +3433,7 @@ Methods
|
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3446,7 +3445,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3466,7 +3465,7 @@ Methods
|
-
Apply the transformations to the data.
+
Apply the transformations to the data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
diff --git a/docs/API/data_cleaning/pruner/index.html b/docs/API/data_cleaning/pruner/index.html
index 3f2b974f5..60dd3337c 100644
--- a/docs/API/data_cleaning/pruner/index.html
+++ b/docs/API/data_cleaning/pruner/index.html
@@ -1212,7 +1212,7 @@
-
- DirectRegressor
+ DirectForecaster
@@ -1254,7 +1254,7 @@
-
- SuccessiveHalvingRegressor
+ SuccessiveHalvingForecaster
@@ -1296,7 +1296,7 @@
-
- TrainSizingRegressor
+ TrainSizingForecaster
@@ -3240,7 +3240,7 @@
Pruner
-
class atom.data_cleaning. Pruner(strategy="zscore", method="drop", max_sigma=3, include_target=False, device="cpu", engine=None, verbose=0, logger=None, **kwargs) [source] Prune outliers from the data.
+
class atom.data_cleaning. Pruner(strategy="zscore", method="drop", max_sigma=3, include_target=False, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs) [source] Prune outliers from the data.
Replace or remove outliers. The definition of outlier depends
on the selected strategy and can greatly differ from one another.
Ignores categorical columns.
@@ -3286,17 +3286,16 @@ Pruner
follows the SYCL_DEVICE_FILTER filter selector, e.g.
device="gpu" to use the GPU. Read more in the
user guide.
- engine: dict or None, default=None engine: dict, default={"data": "numpy", "estimator": "sklearn"}
Execution engine to use for data and
estimators. The value should be a
dictionary with keys data and/or estimator , with their
-corresponding choice as values. If None, the default options
-are selected. Choose from:
+corresponding choice as values. Choose from:
-
"data":
-- "numpy" (default)
+- "numpy"
- "pyarrow"
- "modin"
@@ -3304,7 +3303,7 @@ Pruner
-
"estimator":
-- "sklearn" (default)
+- "sklearn"
- "sklearnex"
- "cuml"
@@ -3381,7 +3380,7 @@ Methods
fit | Does nothing. | fit_transform | Fit to data, then transform it. | get_metadata_routing | Get metadata routing of this object. | get_params | Get parameters for this estimator. | inverse_transform | Does nothing. | log | Print message and save to log file. | save | Save the instance to a pickle file. | set_params | Set the parameters of this estimator. | transform | Apply the outlier strategy on the data. |
|
-
method fit(X=None, y=None, **fit_params)
[source] Does nothing.
+
method fit(X=None, y=None, **fit_params)
[source] Does nothing.
Implemented for continuity of the API.
Parameters | X: dataframe-like or None, default=None
@@ -3406,7 +3405,7 @@ Methods
|
-
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
+
method fit_transform(X=None, y=None, **fit_params)
[source] Fit to data, then transform it.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3450,7 +3449,7 @@ Methods
|
-
method inverse_transform(X=None, y=None)
[source] Does nothing.
+
method inverse_transform(X=None, y=None)
[source] Does nothing.
Parameters | X: dataframe-like or None, default=None
Feature set with shape=(n_samples, n_features). If None,
@@ -3474,7 +3473,7 @@ Methods
|
-
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
+
method log(msg, level=0, severity="info")
[source] Print message and save to log file.
Parameters | msg: int, float or str
Message to save to the logger and print to stdout.
@@ -3486,7 +3485,7 @@ Methods
|
-
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
+
method save(filename="auto", save_data=True)
[source] Save the instance to a pickle file.
Parameters | filename: str, default="auto"
Name of the file. Use "auto" for automatic naming.
@@ -3506,7 +3505,7 @@ Methods
|
-
Apply the outlier strategy on the data.
+
Apply the outlier strategy on the data.
Parameters | X: dataframe-like
Feature set with shape=(n_samples, n_features).
@@ -3688,4 +3687,4 @@ Methods
|