diff --git a/README.md b/README.md index 6cc7a7d11..469065610 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ **General Information** | | --- | --- -**Repository** | [![Project Status: Active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![Conda Recipe](https://img.shields.io/badge/recipe-atom--ml-green.svg)](https://anaconda.org/conda-forge/atom-ml) [![License: MIT](https://img.shields.io/github/license/tvdboom/ATOM)](https://opensource.org/licenses/MIT) [![Downloads](https://pepy.tech/badge/atom-ml)](https://pepy.tech/project/atom-ml) +**Repository** | [![Project Status: Active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![Conda Recipe](https://img.shields.io/badge/recipe-atom--ml-green.svg)](https://anaconda.org/conda-forge/atom-ml) [![License: MIT](https://img.shields.io/github/license/tvdboom/ATOM)](https://opensource.org/licenses/MIT) [![Downloads](https://static.pepy.tech/badge/atom-ml)](https://pepy.tech/project/atom-ml) **Release** | [![pdm-managed](https://img.shields.io/badge/pdm-managed-blueviolet)](https://pdm.fming.dev) [![PyPI version](https://img.shields.io/pypi/v/atom-ml)](https://pypi.org/project/atom-ml/) [![Conda Version](https://img.shields.io/conda/vn/conda-forge/atom-ml.svg)](https://anaconda.org/conda-forge/atom-ml) [![DOI](https://zenodo.org/badge/195069958.svg)](https://zenodo.org/badge/latestdoi/195069958) **Compatibility** | [![Python 3.8\|3.9\|3.10\|3.11](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue?logo=python)](https://www.python.org) [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/atom-ml.svg)](https://anaconda.org/conda-forge/atom-ml) **Build status** | [![Build Status](https://github.com/tvdboom/ATOM/workflows/ATOM/badge.svg)](https://github.com/tvdboom/ATOM/actions) [![Azure 
Pipelines](https://dev.azure.com/conda-forge/feedstock-builds/_apis/build/status/atom-ml-feedstock?branchName=master)](https://dev.azure.com/conda-forge/feedstock-builds/_build/latest?definitionId=10822&branchName=master) [![codecov](https://codecov.io/gh/tvdboom/ATOM/branch/master/graph/badge.svg)](https://codecov.io/gh/tvdboom/ATOM) diff --git a/atom/api.py b/atom/api.py index 8eea06ddf..2cbba7db3 100644 --- a/atom/api.py +++ b/atom/api.py @@ -18,8 +18,8 @@ from atom.atom import ATOM from atom.basetransformer import BaseTransformer from atom.utils.types import ( - BACKEND, BOOL, ENGINE, GOAL, INDEX_SELECTOR, INT, PREDICTOR, SCALAR, - TARGET, + BACKEND, BOOL, ENGINE, INDEX_SELECTOR, INT, PREDICTOR, SCALAR, + TARGET, WARNINGS, ) @@ -160,7 +160,6 @@ class ATOMClassifier(BaseTransformer, ATOM): y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to X. - - If None: y is ignored. - If int: Position of the target column in X. - If str: Name of the target column in X. 
- If sequence: Target array with shape=(n_samples,) or @@ -336,7 +335,7 @@ def __init__( engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: BACKEND = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: BOOL | str = False, + warnings: BOOL | WARNINGS = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, @@ -353,7 +352,7 @@ def __init__( random_state=random_state, ) - self.goal: GOAL = "class" + self.goal = "class" ATOM.__init__( self, arrays=arrays, @@ -555,7 +554,7 @@ def __init__( engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: BACKEND = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: BOOL | str = False, + warnings: BOOL | WARNINGS = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, @@ -572,7 +571,7 @@ def __init__( random_state=random_state, ) - self.goal: GOAL = "fc" + self.goal = "fc" ATOM.__init__( self, arrays=arrays, @@ -790,7 +789,7 @@ def __init__( engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: BACKEND = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: BOOL | str = False, + warnings: BOOL | WARNINGS = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, @@ -807,7 +806,7 @@ def __init__( random_state=random_state, ) - self.goal: GOAL = "reg" + self.goal = "reg" ATOM.__init__( self, arrays=arrays, diff --git a/atom/atom.py b/atom/atom.py index 817f51fb8..73127f3e9 100644 --- a/atom/atom.py +++ b/atom/atom.py @@ -36,9 +36,7 @@ ) from atom.models import MODELS from atom.nlp import TextCleaner, TextNormalizer, Tokenizer, Vectorizer -from atom.plots import ( - DataPlot, FeatureSelectorPlot, HTPlot, PredictionPlot, ShapPlot, -) +from atom.plots import ATOMPlot from atom.training import ( DirectClassifier, DirectForecaster, DirectRegressor, SuccessiveHalvingClassifier, SuccessiveHalvingForecaster, @@ -47,9 +45,10 @@ ) from 
atom.utils.constants import MISSING_VALUES, __version__ from atom.utils.types import ( - BOOL, DATAFRAME, DATASET, FEATURES, INDEX, INDEX_SELECTOR, INT, - METRIC_SELECTOR, PANDAS, PREDICTOR, RUNNER, SCALAR, SEQUENCE, SERIES, - SLICE, TARGET, TRANSFORMER, TS_INDEX_TYPES, + BOOL, DATAFRAME, DATASET, DISCRETIZER_STRATS, ESTIMATOR, FEATURES, INDEX, + INDEX_SELECTOR, INT, METRIC_SELECTOR, PANDAS, PREDICTOR, PRUNER_STRATS, + RUNNER, SCALAR, SCALER_STRATS, SEQUENCE, SERIES, SLICE, STRAT_NUM, TARGET, + TRANSFORMER, TS_INDEX_TYPES, ) from atom.utils.utils import ( ClassMap, DataConfig, check_dependency, check_is_fitted, check_scaling, @@ -60,7 +59,7 @@ @typechecked -class ATOM(BaseRunner, FeatureSelectorPlot, DataPlot, HTPlot, PredictionPlot, ShapPlot): +class ATOM(BaseRunner, ATOMPlot): """ATOM base class. The ATOM class is a convenient wrapper for all data cleaning, @@ -160,7 +159,7 @@ def __repr__(self) -> str: return out - def __iter__(self) -> TRANSFORMER: + def __iter__(self) -> TRANSFORMER | None: yield from self.pipeline.values # Utility properties =========================================== >> @@ -545,7 +544,7 @@ def inverse_transform( y: TARGET | None = None, *, verbose: INT | None = None, - ) -> PANDAS | tuple[DATAFRAME, SERIES]: + ) -> PANDAS | tuple[DATAFRAME, PANDAS]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -898,7 +897,7 @@ def get_data(new_t: str) -> SERIES: get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max() ) - if self.engine["data"] == "pyarrow": + if self.engine.get("data") == "pyarrow": self.branch.dataset = self.branch.dataset.astype( {name: to_pyarrow(col) for name, col in self.branch._data.items()} ) @@ -986,7 +985,7 @@ def transform( y: TARGET | None = None, *, verbose: INT | None = None, - ) -> PANDAS | tuple[DATAFRAME, SERIES]: + ) -> PANDAS | tuple[DATAFRAME, PANDAS]: """Transform new data through the pipeline. 
Transformers that are only applied on the training set are @@ -1068,7 +1067,7 @@ def _add_transformer( self, transformer: TRANSFORMER, columns: SLICE | None = None, - train_only: bool = False, + train_only: BOOL = False, **fit_params, ): """Add a transformer to the pipeline. @@ -1106,9 +1105,6 @@ def _add_transformer( "new branch to continue the pipeline." ) - if not hasattr(transformer, "transform"): - raise AttributeError("Added transformers should have a transform method!") - # Add BaseTransformer params to the estimator if left to default transformer = self._inherit(transformer) @@ -1160,7 +1156,7 @@ def add( transformer: TRANSFORMER, *, columns: SLICE | None = None, - train_only: bool = False, + train_only: BOOL = False, **fit_params, ): """Add a transformer to the pipeline. @@ -1249,9 +1245,8 @@ def apply( ): """Apply a function to the dataset. - The function should have signature `func(dataset, **kw_args) -> - dataset`. This method is useful for stateless transformations - such as taking the log, doing custom scaling, etc... + This method is useful for stateless transformations such as + taking the log, doing custom scaling, etc... !!! note This approach is preferred over changing the dataset directly @@ -1265,7 +1260,8 @@ def apply( Parameters ---------- func: callable - Function to apply. + Function to apply with signature `func(dataset, **kw_args) -> + dataset`. inverse_func: callable or None, default=None Inverse function of `func`. 
If None, the inverse_transform @@ -1336,13 +1332,13 @@ def balance(self, strategy: str = "adasyn", **kwargs): def clean( self, *, - convert_dtypes: bool = True, + convert_dtypes: BOOL = True, drop_dtypes: str | SEQUENCE | None = None, drop_chars: str | None = None, - strip_categorical: bool = True, - drop_duplicates: bool = False, - drop_missing_target: bool = True, - encode_target: bool = True, + strip_categorical: BOOL = True, + drop_duplicates: BOOL = False, + drop_missing_target: BOOL = True, + encode_target: BOOL = True, **kwargs, ): """Applies standard data cleaning steps on the dataset. @@ -1382,7 +1378,7 @@ def clean( @composed(crash, method_to_log) def discretize( self, - strategy: str = "quantile", + strategy: DISCRETIZER_STRATS = "quantile", *, bins: INT | SEQUENCE | dict = 5, labels: SEQUENCE | dict | None = None, @@ -1467,7 +1463,7 @@ def encode( @composed(crash, method_to_log) def impute( self, - strat_num: SCALAR | Literal["drop", "mean", "knn", "most_frequent"] = "drop", + strat_num: STRAT_NUM = "drop", strat_cat: Literal["drop", "most_frequent"] | str = "drop", *, max_nan_rows: SCALAR | None = None, @@ -1539,11 +1535,11 @@ def normalize( @composed(crash, method_to_log) def prune( self, - strategy: str | SEQUENCE = "zscore", + strategy: PRUNER_STRATS | SEQUENCE = "zscore", *, method: SCALAR | Literal["drop", "minmax"] = "drop", max_sigma: SCALAR = 3, - include_target: bool = False, + include_target: BOOL = False, **kwargs, ): """Prune outliers from the training set. @@ -1581,7 +1577,12 @@ def prune( setattr(self.branch, strat.lower(), getattr(pruner, strat.lower())) @composed(crash, method_to_log) - def scale(self, strategy: str = "standard", include_binary: bool = False, **kwargs): + def scale( + self, + strategy: SCALER_STRATS = "standard", + include_binary: BOOL = False, + **kwargs, + ): """Scale the data. Apply one of sklearn's scalers. Categorical columns are ignored. 
@@ -1611,19 +1612,19 @@ def scale(self, strategy: str = "standard", include_binary: bool = False, **kwar def textclean( self, *, - decode: bool = True, - lower_case: bool = True, - drop_email: bool = True, + decode: BOOL = True, + lower_case: BOOL = True, + drop_email: BOOL = True, regex_email: str | None = None, - drop_url: bool = True, + drop_url: BOOL = True, regex_url: str | None = None, - drop_html: bool = True, + drop_html: BOOL = True, regex_html: str | None = None, - drop_emoji: bool = True, + drop_emoji: BOOL = True, regex_emoji: str | None = None, - drop_number: bool = True, + drop_number: BOOL = True, regex_number: str | None = None, - drop_punctuation: bool = True, + drop_punctuation: BOOL = True, **kwargs, ): """Applies standard text cleaning to the corpus. @@ -1664,10 +1665,10 @@ def textclean( def textnormalize( self, *, - stopwords: bool | str = True, + stopwords: BOOL | str = True, custom_stopwords: SEQUENCE | None = None, - stem: bool | str = False, - lemmatize: bool = True, + stem: BOOL | str = False, + lemmatize: BOOL = True, **kwargs, ): """Normalize the corpus. @@ -1727,7 +1728,13 @@ def tokenize( self.branch.quadgrams = tokenizer.quadgrams @composed(crash, method_to_log) - def vectorize(self, strategy: str = "bow", *, return_sparse: bool = True, **kwargs): + def vectorize( + self, + strategy: Literal["bow", "tfidf", "hashing"] = "bow", + *, + return_sparse: BOOL = True, + **kwargs, + ): """Vectorize the corpus. Transform the corpus into meaningful vectors of numbers. The @@ -1766,7 +1773,7 @@ def feature_extraction( fmt: str | SEQUENCE | None = None, *, encoding_type: str = "ordinal", - drop_columns: bool = True, + drop_columns: BOOL = True, **kwargs, ): """Extract features from datetime columns. @@ -1831,7 +1838,7 @@ def feature_grouping( group: dict[str, str | SEQUENCE], *, operators: str | SEQUENCE | None = None, - drop_columns: bool = True, + drop_columns: BOOL = True, **kwargs, ): """Extract statistics from similar features. 
@@ -1862,7 +1869,7 @@ def feature_selection( self, strategy: str | None = None, *, - solver: str | Callable | None = None, + solver: str | ESTIMATOR | None = None, n_features: SCALAR | None = None, min_repeated: SCALAR | None = 2, max_repeated: SCALAR | None = 1.0, @@ -2005,7 +2012,7 @@ def run( n_trials: INT | dict | SEQUENCE = 0, ht_params: dict | None = None, n_bootstrap: INT | SEQUENCE = 0, - parallel: bool = False, + parallel: BOOL = False, errors: Literal["raise", "skip", "keep"] = "skip", **kwargs, ): @@ -2061,7 +2068,7 @@ def successive_halving( n_trials: INT | dict | SEQUENCE = 0, ht_params: dict | None = None, n_bootstrap: INT | dict | SEQUENCE = 0, - parallel: bool = False, + parallel: BOOL = False, errors: Literal["raise", "skip", "keep"] = "skip", **kwargs, ): @@ -2124,7 +2131,7 @@ def train_sizing( n_trials: INT | dict | SEQUENCE = 0, ht_params: dict | None = None, n_bootstrap: INT | dict | SEQUENCE = 0, - parallel: bool = False, + parallel: BOOL = False, errors: Literal["raise", "skip", "keep"] = "skip", **kwargs, ): diff --git a/atom/basemodel.py b/atom/basemodel.py index 1060c7421..723c128da 100644 --- a/atom/basemodel.py +++ b/atom/basemodel.py @@ -17,7 +17,7 @@ from logging import Logger from typing import Any, Callable, Literal from unittest.mock import patch - +from typeguard import TypeCheckError import dill as pickle import mlflow import numpy as np @@ -56,12 +56,12 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler from atom.pipeline import Pipeline -from atom.plots import HTPlot, PredictionPlot, ShapPlot +from atom.plots import RunnerPlot from atom.utils.constants import DF_ATTRS from atom.utils.types import ( BOOL, BRANCH, DATAFRAME, DATAFRAME_TYPES, ENGINE, FEATURES, FLOAT, - FLOAT_TYPES, GOAL, INDEX, INT, INT_TYPES, METRIC_SELECTOR, PANDAS, - PREDICTOR, SCALAR, SCORER, SEQUENCE, SERIES, SLICE, TARGET, + FLOAT_TYPES, INDEX, INT, INT_TYPES, METRIC_SELECTOR, PANDAS, + PREDICTOR, SCALAR, SCORER, 
SEQUENCE, SERIES, SLICE, TARGET, WARNINGS, ) from atom.utils.utils import ( ClassMap, CustomDict, DataConfig, PlotCallback, ShapExplanation, @@ -75,7 +75,7 @@ @typechecked -class BaseModel(BaseTransformer, BaseTracker, HTPlot, PredictionPlot, ShapPlot): +class BaseModel(BaseTransformer, BaseTracker, RunnerPlot): """Base class for all models. Parameters @@ -174,7 +174,7 @@ class BaseModel(BaseTransformer, BaseTracker, HTPlot, PredictionPlot, ShapPlot): def __init__( self, name: str | None = None, - goal: GOAL = "class", + goal: Literal["class", "reg", "fc"] = "class", config: DataConfig | None = None, og: BRANCH | None = None, branch: BRANCH | None = None, @@ -184,7 +184,7 @@ def __init__( engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: str = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: BOOL | str = False, + warnings: BOOL | WARNINGS = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, @@ -276,16 +276,12 @@ def _fullname(self) -> str: """Return the model's class name.""" return self.__class__.__name__ - @property - def _gpu(self) -> BOOL: - """Return whether the model uses a GPU implementation.""" - return "gpu" in self.device.lower() - @property def _est_class(self) -> PREDICTOR: """Return the estimator's class (not instance).""" try: - module = import_module(f"{self.engine['estimator']}.{self._module}") + engine = self.engine.get("estimator", "sklearn") + module = import_module(f"{engine}.{self._module}") cls = self._estimators.get(self.goal, self._estimators.get("reg")) except (ModuleNotFoundError, AttributeError): if "sklearn" in self.supports_engines: @@ -442,9 +438,9 @@ def _get_est(self, **params) -> PREDICTOR: def _fit_estimator( self, estimator: PREDICTOR, - data: tuple[DATAFRAME, SERIES], + data: tuple[DATAFRAME, PANDAS], est_params_fit: dict, - validation: tuple[DATAFRAME, SERIES] | None = None, + validation: tuple[DATAFRAME, PANDAS] | None = None, trial: Trial | None = 
None, ) -> PREDICTOR: """Fit the estimator and perform in-training validation. @@ -581,7 +577,7 @@ def _final_output(self) -> str: if (1.2 if score_train < 0 else 0.8) * score_train > score_test: out += " ~" - except AttributeError: # Fails when model failed but errors="keep" + except TypeCheckError: # Fails when model failed but errors="keep" out = "FAIL" return out @@ -692,7 +688,7 @@ def _score_from_pred( y_true: PANDAS, y_pred: PANDAS, **kwargs, - ) -> FLOAT: + ) -> SCALAR: """Calculate the metric score from predicted values. Since sklearn metrics don't support multiclass-multioutput @@ -715,7 +711,7 @@ def _score_from_pred( Returns ------- - float + int or float Calculated score. """ @@ -740,7 +736,7 @@ def _get_score( dataset: str, threshold: tuple[FLOAT] | None = None, sample_weight: tuple | None = None, - ) -> FLOAT: + ) -> SCALAR: """Calculate a metric score using the prediction attributes. The method results are cached to avoid recalculation of the @@ -771,7 +767,7 @@ def _get_score( Returns ------- - float + int or float Metric score on the selected data set. 
""" @@ -886,7 +882,7 @@ def fit_model( y_val = self.og.y_train.iloc[val_idx] # Transform subsets if there is a pipeline - if len(pl := self.export_pipeline(verbose=0)[:-1]) > 0: + if len(pl := export_pipeline(self.pipeline, verbose=0)) > 0: X_subtrain, y_subtrain = pl.fit_transform(X_subtrain, y_subtrain) X_val, y_val = pl.transform(X_val, y_val) @@ -1401,17 +1397,17 @@ def evals(self) -> CustomDict: return self._evals @property - def score_train(self) -> FLOAT | list[FLOAT]: + def score_train(self) -> SCALAR | list[SCALAR]: """Metric score on the training set.""" return flt([self._get_score(m, "train") for m in self._metric]) @property - def score_test(self) -> FLOAT | list[FLOAT]: + def score_test(self) -> SCALAR | list[SCALAR]: """Metric score on the test set.""" return flt([self._get_score(m, "test") for m in self._metric]) @property - def score_holdout(self) -> FLOAT | list[FLOAT]: + def score_holdout(self) -> SCALAR | list[SCALAR]: """Metric score on the holdout set.""" return flt([self._get_score(m, "holdout") for m in self._metric]) @@ -1433,7 +1429,7 @@ def bootstrap(self) -> pd.DataFrame | None: return self._bootstrap @property - def score_bootstrap(self) -> FLOAT | list[FLOAT] | None: + def score_bootstrap(self) -> SCALAR | list[SCALAR] | None: """Mean metric score on the bootstrapped samples.""" if self.bootstrap is not None: return flt(self.bootstrap.mean().tolist()) @@ -2141,7 +2137,7 @@ def inverse_transform( y: TARGET | None = None, *, verbose: INT | None = None, - ) -> PANDAS | tuple[DATAFRAME, SERIES]: + ) -> PANDAS | tuple[DATAFRAME, PANDAS]: """Inversely transform new data through the pipeline. Transformers that are only applied on the training set are @@ -2200,7 +2196,7 @@ def register( self, name: str | None = None, stage: str = "None", - archive_existing_versions: bool = False, + archive_existing_versions: BOOL = False, ): """Register the model in [mlflow's model registry][registry]. 
@@ -2340,7 +2336,7 @@ def transform( y: TARGET | None = None, *, verbose: INT | None = None, - ) -> PANDAS | tuple[DATAFRAME, SERIES]: + ) -> PANDAS | tuple[DATAFRAME, PANDAS]: """Transform new data through the pipeline. Transformers that are only applied on the training set are @@ -3490,7 +3486,7 @@ def predict_proba( self, fh: int | SEQUENCE | ForecastingHorizon, X: FEATURES | None = None, - marginal: bool = True, + marginal: BOOL = True, verbose: INT | None = None, ) -> Normal: """Get probabilistic forecasts on new data or existing rows. @@ -3624,7 +3620,7 @@ def predict_var( self, fh: int | SEQUENCE | ForecastingHorizon, X: FEATURES | None = None, - cov: bool = False, + cov: BOOL = False, verbose: INT | None = None, ) -> DATAFRAME: """Get probabilistic forecasts on new data or existing rows. diff --git a/atom/baserunner.py b/atom/baserunner.py index 758553df0..7d66f424e 100644 --- a/atom/baserunner.py +++ b/atom/baserunner.py @@ -97,13 +97,10 @@ def __len__(self) -> int: return len(self.dataset) def __contains__(self, item: str) -> BOOL: - if self.dataset is None: - return False - else: - return item in self.dataset + return item in self.dataset def __getitem__(self, item: INT | str | list) -> Any: - if self.dataset is None: + if self.dataset.empty: raise RuntimeError( "This instance has no dataset annexed to it. " "Use the run method before calling __getitem__." @@ -122,18 +119,13 @@ def __getitem__(self, item: INT | str | list) -> Any: f"{self.__class__.__name__} object has no " f"branch, model or column called {item}." ) - elif isinstance(item, list): - return self.dataset[item] # Get subset of dataset else: - raise TypeError( - f"{self.__class__.__name__} is only " - "subscriptable with types int, str or list." - ) + return self.dataset[item] # Get subset of dataset # Utility properties =========================================== >> @property - def og(self) -> Branch: + def og(self) -> BRANCH: """Branch containing the original dataset. 
This branch contains the data prior to any transformations. @@ -144,7 +136,7 @@ def og(self) -> Branch: return self._og or self.branch @property - def branch(self) -> Branch: + def branch(self) -> BRANCH: """Current active branch. Use the property's `@setter` to change the branch or to create diff --git a/atom/basetracker.py b/atom/basetracker.py index 6a919a637..8cb73fe9c 100644 --- a/atom/basetracker.py +++ b/atom/basetracker.py @@ -7,6 +7,8 @@ """ +from __future__ import annotations + from dataclasses import dataclass from typeguard import typechecked diff --git a/atom/basetrainer.py b/atom/basetrainer.py index 6b26fc6f9..0c94f89d3 100644 --- a/atom/basetrainer.py +++ b/atom/basetrainer.py @@ -13,7 +13,7 @@ import traceback from datetime import datetime as dt from typing import Any - +from typeguard import TypeCheckError import joblib import mlflow import numpy as np @@ -28,7 +28,7 @@ from atom.branch import Branch from atom.data_cleaning import BaseTransformer from atom.models import MODELS, CustomModel -from atom.plots import HTPlot, PredictionPlot, ShapPlot +from atom.plots import RunnerPlot from atom.utils.types import MODEL, SEQUENCE_TYPES from atom.utils.utils import ( ClassMap, DataConfig, check_dependency, get_best_score, get_custom_scorer, @@ -37,7 +37,7 @@ @typechecked -class BaseTrainer(BaseTransformer, BaseRunner, HTPlot, PredictionPlot, ShapPlot): +class BaseTrainer(BaseTransformer, BaseRunner, RunnerPlot): """Base class for trainers. 
Implements methods to check the validity of the parameters, @@ -432,7 +432,7 @@ def execute_model(m: MODEL) -> MODEL | None: try: scores.append(get_best_score(model)) - except AttributeError: # Fails when model failed but errors="keep" + except TypeCheckError: # Fails when model failed but errors="keep" scores.append(-np.inf) maxlen = max(maxlen, len(names[-1])) diff --git a/atom/basetransformer.py b/atom/basetransformer.py index edac1266a..c94b0df4a 100644 --- a/atom/basetransformer.py +++ b/atom/basetransformer.py @@ -18,7 +18,7 @@ from importlib.util import find_spec from logging import DEBUG, FileHandler, Formatter, Logger, getLogger from multiprocessing import cpu_count -from typing import Any, Callable +from typing import Any, Callable, Literal import dagshub import dill as pickle @@ -33,8 +33,8 @@ from typeguard import typechecked from atom.utils.types import ( - BOOL, DATAFRAME, DATAFRAME_TYPES, FEATURES, INDEX, INT, INT_TYPES, PANDAS, - PREDICTOR, SCALAR, SEQUENCE, SEQUENCE_TYPES, TARGET, + BACKEND, BOOL, DATAFRAME, DATAFRAME_TYPES, ENGINE, ESTIMATOR, FEATURES, + INT, INT_TYPES, PANDAS, SCALAR, SEQUENCE, SEQUENCE_TYPES, TARGET, WARNINGS, ) from atom.utils.utils import ( bk, composed, crash, get_cols, lst, merge, method_to_log, n_cols, pd, sign, @@ -101,7 +101,8 @@ def n_jobs(self, value: INT): # Final check for negative input if value < 1: raise ValueError( - f"Invalid value for the n_jobs parameter, got {value}.", 1 + "Invalid value for the n_jobs parameter, " + f"got {value}. 
Value should be >=0.", 1 ) self._n_jobs = value @@ -118,92 +119,55 @@ def device(self, value: str): os.environ["CUDA_VISIBLE_DEVICES"] = str(self._device_id) @property - def engine(self) -> dict: + def engine(self) -> ENGINE: """Execution engine for estimators.""" return self._engine @engine.setter - def engine(self, value: dict | None): - if not value: - value = {"data": "numpy", "estimator": "sklearn"} - elif "data" not in value and "estimator" not in value: - raise ValueError( - f"Invalid value for the engine parameter, got {value}. " - "The value should be a dict with keys 'data' and/or 'estimator'." + def engine(self, value: ENGINE): + if value.get("data") == "modin" and not ray.is_initialized(): + ray.init( + runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}}, + log_to_driver=False, ) - if data := value.get("data"): - if data.lower() == "modin": - if not ray.is_initialized(): - ray.init( - runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}}, - log_to_driver=False, - ) - elif data.lower() not in ("numpy", "pyarrow"): - raise ValueError( - "Invalid value for the data key of the engine parameter, " - f"got {data}. Choose from: numpy, pyarrow, modin." - ) - else: - value["data"] = "numpy" - # Update env variable to use for PandasModin in utils.py - os.environ["ATOM_DATA_ENGINE"] = value["data"].lower() - - if models := value.get("estimator"): - device = self.device.lower() - - if models.lower() == "sklearnex": - if not find_spec("sklearnex"): - raise ModuleNotFoundError( - "Failed to import scikit-learn-intelex. The library is " - "not installed. Note that the library only supports CPUs " - "with a x86 architecture." - ) - else: - import sklearnex - sklearnex.set_config(device if "gpu" in device else "auto") - elif models.lower() == "cuml": - if not find_spec("cuml"): - raise ModuleNotFoundError( - "Failed to import cuml. Package is not installed. Refer " - "to: https://rapids.ai/start.html#install." 
- ) - else: - from cuml.common.device_selection import ( - set_global_device_type, - ) - set_global_device_type("gpu" if "gpu" in device else "cpu") - - # See https://github.com/rapidsai/cuml/issues/5564 - from cuml.internals.memory_utils import ( - set_global_output_type, - ) - set_global_output_type("numpy") - - elif models.lower() != "sklearn": - raise ValueError( - "Invalid value for the models key of the engine parameter, " - f"got {models}. Choose from: sklearn, sklearnex, cuml." + os.environ["ATOM_DATA_ENGINE"] = value.get("data", "numpy") + + if value.get("estimator") == "sklearnex": + if not find_spec("sklearnex"): + raise ModuleNotFoundError( + "Failed to import scikit-learn-intelex. The library is " + "not installed. Note that the library only supports CPUs " + "with a x86 architecture." ) - else: - value["estimator"] = "sklearn" + else: + import sklearnex + sklearnex.set_config(self.device.lower() if self._gpu else "auto") + elif value.get("estimator") == "cuml": + if not find_spec("cuml"): + raise ModuleNotFoundError( + "Failed to import cuml. Package is not installed. Refer " + "to: https://rapids.ai/start.html#install." + ) + else: + from cuml.common.device_selection import set_global_device_type + set_global_device_type("gpu" if self._gpu else "cpu") + + # See https://github.com/rapidsai/cuml/issues/5564 + from cuml.internals.memory_utils import set_global_output_type + set_global_output_type("numpy") self._engine = value @property - def backend(self) -> str: + def backend(self) -> BACKEND: """Parallelization backend.""" return self._backend @backend.setter - def backend(self, value: str): - if value.lower() not in (opts := ("loky", "multiprocessing", "threading", "ray")): - raise ValueError( - f"Invalid value for the backend parameter, got " - f"{value}. Choose from: {', '.join(opts)}." 
- ) - elif value.lower() == "ray": + def backend(self, value: BACKEND): + if value == "ray": register_ray() # Register ray as joblib backend if not ray.is_initialized(): ray.init(log_to_driver=False) @@ -211,35 +175,24 @@ def backend(self, value: str): self._backend = value @property - def verbose(self) -> INT: + def verbose(self) -> Literal[0, 1, 2]: """Verbosity level of the output.""" return self._verbose @verbose.setter - def verbose(self, value: INT): - if value < 0 or value > 2: - raise ValueError( - "Invalid value for the verbose parameter. Value" - f" should be between 0 and 2, got {value}." - ) + def verbose(self, value: Literal[0, 1, 2]): self._verbose = value @property - def warnings(self) -> str: + def warnings(self) -> WARNINGS: """Whether to show or suppress encountered warnings.""" return self._warnings @warnings.setter - def warnings(self, value: BOOL | str): + def warnings(self, value: BOOL | WARNINGS): if isinstance(value, BOOL): self._warnings = "default" if value else "ignore" else: - options = ("default", "error", "ignore", "always", "module", "once") - if value not in options: - raise ValueError( - "Invalid value for the warnings parameter, got " - f"{value}. Choose from: {', '.join(options)}." 
- ) self._warnings = value warnings.filterwarnings(self._warnings) # Change the filter in this process @@ -336,7 +289,7 @@ def experiment(self, value: str | None): mlflow.set_experiment(value) @property - def random_state(self) -> INT: + def random_state(self) -> INT | None: """Seed used by the random number generator.""" return self._random_state @@ -351,6 +304,11 @@ def random_state(self, value: INT | None): np.random.seed(value) self._random_state = value + @property + def _gpu(self) -> BOOL: + """Return whether the instance uses a GPU implementation.""" + return "gpu" in self.device.lower() + @property def _device_id(self) -> int: """Which GPU device to use.""" @@ -392,7 +350,7 @@ def _inherit(self, obj: Any) -> Any: return obj - def _get_est_class(self, name: str, module: str) -> PREDICTOR: + def _get_est_class(self, name: str, module: str) -> ESTIMATOR: """Import a class from a module. When the import fails, for example if atom uses sklearnex and @@ -408,12 +366,13 @@ def _get_est_class(self, name: str, module: str) -> PREDICTOR: Returns ------- - Predictor + Estimator Class of the estimator. """ try: - return getattr(import_module(f"{self.engine['estimator']}.{module}"), name) + engine = self.engine.get("estimator", "sklearn") + return getattr(import_module(f"{engine}.{module}"), name) except (ModuleNotFoundError, AttributeError): return getattr(import_module(f"sklearn.{module}"), name) @@ -925,7 +884,7 @@ def _has_data_sets( if self.goal == "fc" and not isinstance(y, (INT, str)): # arrays=() and y=y for forecasting sets = _no_data_sets(*self._prepare_input(y=y)) - elif self.branch._data is None: + elif self.branch._data.empty: raise ValueError( "The data arrays are empty! Provide the data to run the pipeline " "successfully. See the documentation for the allowed formats." 
@@ -1042,7 +1001,7 @@ def log(self, msg: SCALAR | str, level: INT = 0, severity: str = "info"): getattr(self.logger, severity)(str(text)) @composed(crash, method_to_log) - def save(self, filename: str = "auto", *, save_data: bool = True): + def save(self, filename: str = "auto", *, save_data: BOOL = True): """Save the instance to a pickle file. Parameters diff --git a/atom/branch.py b/atom/branch.py index 423d0ce28..9abf44789 100644 --- a/atom/branch.py +++ b/atom/branch.py @@ -43,8 +43,8 @@ class Branch: name: str Name of the branch. - data: dataframe or None, default=None - Complete dataset. + data: dataframe, default=pd.DataFrame() + Complete dataset. Defaults to an empty frame if not provided. index: list or None, default=None A list containing the number of target columns, the indices of @@ -61,7 +61,7 @@ class Branch: def __init__( self, name: str, - data: DATAFRAME | None = None, + data: DATAFRAME = pd.DataFrame(), index: list[INT, INDEX, INDEX] | None = None, holdout: DATAFRAME | None = None, parent: BRANCH | None = None, @@ -69,7 +69,7 @@ def __init__( self._data = data self._idx = index self._holdout = holdout - self._pipeline = pd.Series(data=[], dtype="object") + self._pipeline = pd.Series(dtype="object") self._mapping = CustomDict() # If a parent branch is provided, transfer its attrs to this one @@ -87,7 +87,7 @@ def __repr__(self) -> str: return f"Branch({self.name})" def __bool__(self): - return self._data is not None + return not self._data.empty @property def name(self) -> str: @@ -172,7 +172,7 @@ def counter(name: str, dim: str) -> str: value = to_pandas( data=value, index=side.index if side_name else None, - name=getattr(under, "name", None) if under_name else None, + name=getattr(under, "name", None) if under_name else "target", columns=getattr(under, "columns", None) if under_name else None, dtype=under.dtypes if under_name else None, ) diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py index 4bcdbca48..59390a36b 100644 --- 
a/atom/data_cleaning.py +++ b/atom/data_cleaning.py @@ -47,8 +47,9 @@ from atom.basetransformer import BaseTransformer from atom.utils.constants import MISSING_VALUES from atom.utils.types import ( - BOOL, DATAFRAME, DATAFRAME_TYPES, ENGINE, ESTIMATOR, FEATURES, FLOAT, INT, - PANDAS, SCALAR, SEQUENCE, SEQUENCE_TYPES, SERIES_TYPES, TARGET, + BOOL, DATAFRAME, DATAFRAME_TYPES, DISCRETIZER_STRATS, ENGINE, ESTIMATOR, + FEATURES, FLOAT, INT, PANDAS, PRUNER_STRATS, SCALAR, SCALER_STRATS, + SEQUENCE, SEQUENCE_TYPES, SERIES_TYPES, STRAT_NUM, TARGET, ) from atom.utils.utils import ( CustomDict, bk, check_is_fitted, composed, crash, get_cols, it, lst, merge, @@ -1082,7 +1083,7 @@ class Discretizer(BaseEstimator, TransformerMixin, BaseTransformer): def __init__( self, - strategy: str = "quantile", + strategy: DISCRETIZER_STRATS = "quantile", *, bins: INT | SEQUENCE | dict = 5, labels: SEQUENCE | dict | None = None, @@ -1151,12 +1152,6 @@ def get_labels(labels, bins): self._check_n_features(X, reset=True) self._num_cols = list(X.select_dtypes(include="number")) - if self.strategy.lower() not in ("uniform", "quantile", "kmeans", "custom"): - raise ValueError( - f"Invalid value for the strategy parameter, got {self.strategy}. " - "Choose from: uniform, quantile, kmeans, custom." 
- ) - self.log("Fitting Discretizer...", 1) labels = {} if self.labels is None else self.labels @@ -1173,7 +1168,7 @@ def get_labels(labels, bins): else: bins = self.bins - if self.strategy.lower() != "custom": + if self.strategy != "custom": if isinstance(bins, SEQUENCE_TYPES): try: bins = bins[i] # Fetch the i-th bin for the i-th column @@ -1186,15 +1181,16 @@ def get_labels(labels, bins): estimator = self._get_est_class("KBinsDiscretizer", "preprocessing") - # cuML implementation has no random_state + # cuML implementation has no subsample and random_state kwargs = {} - if "random_state" in sign(estimator): + if "subsample" in sign(estimator): + kwargs["subsample"] = 200000 kwargs["random_state"] = self.random_state self._discretizers[col] = estimator( n_bins=bins, encode="ordinal", - strategy=self.strategy.lower(), + strategy=self.strategy, **kwargs, ).fit(X[[col]]) @@ -1806,7 +1802,7 @@ class Imputer(BaseEstimator, TransformerMixin, BaseTransformer): def __init__( self, - strat_num: SCALAR | Literal["drop", "mean", "knn", "most_frequent"] = "drop", + strat_num: STRAT_NUM = "drop", strat_cat: Literal["drop", "most_frequent"] | str = "drop", *, max_nan_rows: SCALAR | None = None, @@ -1853,12 +1849,6 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Imputer: self._num_cols = list(X.select_dtypes(include="number")) # Check input Parameters - strategies = ["drop", "mean", "median", "knn", "most_frequent"] - if isinstance(self.strat_num, str) and self.strat_num.lower() not in strategies: - raise ValueError( - "Unknown strategy for the strat_num parameter, got " - f"{self.strat_num}. Choose from: {', '.join(strategies)}." 
- ) if self.max_nan_rows: if self.max_nan_rows < 0: raise ValueError( @@ -1902,10 +1892,8 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Imputer: self._imputers = {} # Load the imputer class from sklearn or cuml (different modules) - estimator = self._get_est_class( - name="SimpleImputer", - module="preprocessing" if self.engine["estimator"] == "cuml" else "impute", - ) + module = "preprocessing" if self.engine.get("estimator") == "cuml" else "impute" + estimator = self._get_est_class("SimpleImputer", module) # Assign an imputer to each column for name, column in X.items(): @@ -2496,11 +2484,11 @@ class Pruner(BaseEstimator, TransformerMixin, BaseTransformer): def __init__( self, - strategy: str | SEQUENCE = "zscore", + strategy: PRUNER_STRATS | SEQUENCE = "zscore", *, method: SCALAR | Literal["drop", "minmax"] = "drop", max_sigma: SCALAR = 3, - include_target: bool = False, + include_target: BOOL = False, device: str = "cpu", engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, verbose: Literal[0, 1, 2] = 0, @@ -2800,8 +2788,8 @@ class Scaler(BaseEstimator, TransformerMixin, BaseTransformer): def __init__( self, - strategy: str = "standard", - include_binary: bool = False, + strategy: SCALER_STRATS = "standard", + include_binary: BOOL = False, *, device: str = "cpu", engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, @@ -2853,14 +2841,8 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Scaler: robust="RobustScaler", ) - if self.strategy in strategies: - estimator = self._get_est_class(strategies[self.strategy], "preprocessing") - self._estimator = estimator(**self.kwargs) - else: - raise ValueError( - f"Invalid value for the strategy parameter, got {self.strategy}. " - f"Choose from: {', '.join(strategies)}." 
- ) + estimator = self._get_est_class(strategies[self.strategy], "preprocessing") + self._estimator = estimator(**self.kwargs) self.log("Fitting Scaler...", 1) self._estimator.fit(X[self._num_cols]) diff --git a/atom/ensembles.py b/atom/ensembles.py index 50cfec4f2..3763bd138 100644 --- a/atom/ensembles.py +++ b/atom/ensembles.py @@ -381,7 +381,7 @@ def fit( X: FEATURES, y: SEQUENCE, sample_weight: SEQUENCE | None = None, - ) -> VotingRegressor: + ) -> StackingClassifier: """Fit the estimators, skipping prefit ones. Parameters diff --git a/atom/feature_engineering.py b/atom/feature_engineering.py index 3c5482582..8f4238032 100644 --- a/atom/feature_engineering.py +++ b/atom/feature_engineering.py @@ -13,7 +13,7 @@ from collections import defaultdict from logging import Logger from random import sample -from typing import Callable, Literal +from typing import Literal import featuretools as ft import joblib @@ -36,10 +36,10 @@ from atom.basetransformer import BaseTransformer from atom.data_cleaning import Scaler, TransformerMixin from atom.models import MODELS -from atom.plots import FeatureSelectorPlot +from atom.plots import FeatureSelectionPlot from atom.utils.types import ( - BOOL, DATAFRAME, ENGINE, FEATURES, FLOAT, INT, INT_TYPES, SCALAR, SEQUENCE, - SEQUENCE_TYPES, SERIES_TYPES, TARGET, + BOOL, DATAFRAME, ENGINE, ESTIMATOR, FEATURES, FLOAT, INT, INT_TYPES, + SCALAR, SEQUENCE, SEQUENCE_TYPES, SERIES_TYPES, TARGET, ) from atom.utils.utils import ( CustomDict, check_is_fitted, check_scaling, composed, crash, @@ -844,7 +844,7 @@ class FeatureSelector( BaseEstimator, TransformerMixin, BaseTransformer, - FeatureSelectorPlot, + FeatureSelectionPlot, ): """Reduce the number of features in the data. 
@@ -1118,7 +1118,7 @@ def __init__( self, strategy: str | None = None, *, - solver: str | Callable | None = None, + solver: str | ESTIMATOR | None = None, n_features: SCALAR | None = None, min_repeated: SCALAR | None = 2, max_repeated: SCALAR | None = 1.0, diff --git a/atom/models/__init__.py b/atom/models/__init__.py new file mode 100644 index 000000000..54274dfa9 --- /dev/null +++ b/atom/models/__init__.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module for models. + +To add new models note the following: + +1. Add the class in the right file depending on task. +2. Models are ordered alphabetically. +3. Models have the following structure: + + Class attributes + ---------------- + acronym: str + Acronym of the model's name. + + needs_scaling: bool + Whether the model needs scaled features. + + accepts_sparse: bool + Whether the model has native support for sparse matrices. + + native_multilabel: bool + Whether the model has native support for multilabel tasks. + + native_multioutput: bool + Whether the model has native support for multioutput tasks. + + has_validation: str or None + Whether the model allows in-training validation. If str, + name of the estimator's parameter that states the number + of iterations. If None, no support for in-training + validation. + + supports_engines: list + Engines that can be used to run this model. + + _module: str + Module from which to load the class. If one of engines, + ignore the engine name, i.e. use "ensemble" instead of + "sklearn.ensemble". + + _estimators: CustomDict + Name of the estimators per goal. + + Instance attributes + ------------------- + name: str + Name of the model. Defaults to the same as the acronym + but can be different if the same model is called multiple + times. The name is assigned in the basemodel.py module. 
+ + Methods + ------- + _get_parameters(self, x) -> CustomDict: + Return the trial's suggestions with rounded decimals and + (optionally) custom changes to the params. Don't implement + if the parent's implementation is sufficient. + + _trial_to_est(self, params) -> CustomDict: + Convert trial's hyperparameters to parameters for the + estimator. Only implement for models whose study params are + different from those for the estimator. + + _fit_estimator(self, estimator, data, est_params_fit, validation, trial): + This method is called to fit the estimator. Implement only + to customize the fit. + + _get_distributions(self) -> CustomDict: + Return a list of the hyperparameter distributions for + optimization. + +""" + +from atom.basemodel import ClassRegModel +from atom.models.classreg import ( + AdaBoost, AutomaticRelevanceDetermination, Bagging, BayesianRidge, + BernoulliNB, CatBoost, CategoricalNB, ComplementNB, DecisionTree, Dummy, + ElasticNet, ExtraTree, ExtraTrees, GaussianNB, GaussianProcess, + GradientBoostingMachine, HistGradientBoosting, HuberRegression, + KNearestNeighbors, Lasso, LeastAngleRegression, LightGBM, + LinearDiscriminantAnalysis, LinearSVM, LogisticRegression, + MultiLayerPerceptron, MultinomialNB, OrdinaryLeastSquares, + OrthogonalMatchingPursuit, PassiveAggressive, Perceptron, + QuadraticDiscriminantAnalysis, RadiusNearestNeighbors, RandomForest, Ridge, + StochasticGradientDescent, SupportVectorMachine, XGBoost, +) +from atom.models.ensembles import Stacking, Voting +from atom.models.ts import ( + ARIMA, ETS, AutoARIMA, ExponentialSmoothing, NaiveForecaster, + PolynomialTrend, +) +from atom.utils.types import PREDICTOR +from atom.utils.utils import ClassMap + + +# Available models +MODELS = ClassMap( + AdaBoost, + ARIMA, + AutoARIMA, + AutomaticRelevanceDetermination, + Bagging, + BayesianRidge, + BernoulliNB, + CatBoost, + CategoricalNB, + ComplementNB, + DecisionTree, + Dummy, + ElasticNet, + ETS, + ExponentialSmoothing, + ExtraTree, + 
ExtraTrees, + GaussianNB, + GaussianProcess, + GradientBoostingMachine, + HuberRegression, + HistGradientBoosting, + KNearestNeighbors, + Lasso, + LeastAngleRegression, + LightGBM, + LinearDiscriminantAnalysis, + LinearSVM, + LogisticRegression, + MultiLayerPerceptron, + MultinomialNB, + NaiveForecaster, + OrdinaryLeastSquares, + OrthogonalMatchingPursuit, + PassiveAggressive, + Perceptron, + PolynomialTrend, + QuadraticDiscriminantAnalysis, + RadiusNearestNeighbors, + RandomForest, + Ridge, + StochasticGradientDescent, + SupportVectorMachine, + XGBoost, + key="acronym", +) + +# Available ensembles +ENSEMBLES = ClassMap(Stacking, Voting, key="acronym") + +# Available models + ensembles +MODELS_ENSEMBLES = ClassMap(*MODELS, *ENSEMBLES, key="acronym") + + +class CustomModel(ClassRegModel): + """Model with estimator provided by user.""" + + def __init__(self, **kwargs): + if callable(est := kwargs.pop("estimator")): # Estimator provided by the user + self._est = est + self._params = {} + else: + self._est = est.__class__ + self._params = est.get_params() # Store the provided parameters + + if hasattr(est, "name"): + name = est.name + else: + # If no name is provided, use the name of the class + name = self._fullname + if len(n := list(filter(str.isupper, name))) >= 2 and n not in MODELS: + name = "".join(n) + + self.acronym = getattr(est, "acronym", name) + if not name.startswith(self.acronym): + raise ValueError( + f"The name ({name}) and acronym ({self.acronym}) of model " + f"{self._fullname} do not match. The name should start with " + f"the model's acronym." 
+ ) + + self.needs_scaling = getattr(est, "needs_scaling", False) + self.native_multilabel = getattr(est, "native_multilabel", False) + self.native_multioutput = getattr(est, "native_multioutput", False) + self.has_validation = getattr(est, "has_validation", None) + + super().__init__(name=name, **kwargs) + + @property + def _fullname(self) -> str: + """Return the estimator's class name.""" + return self._est_class.__name__ + + @property + def _est_class(self): + """Return the estimator's class.""" + return self._est + + def _get_est(self, **params) -> PREDICTOR: + """Get the model's estimator with unpacked parameters. + + Returns + ------- + PREDICTOR + Estimator instance. + + """ + return super()._get_est(**{**self._params, **params}) diff --git a/atom/models.py b/atom/models/classreg.py similarity index 75% rename from atom/models.py rename to atom/models/classreg.py index c29b6e52a..abb83c5bb 100644 --- a/atom/models.py +++ b/atom/models/classreg.py @@ -1,4081 +1,3254 @@ -# -*- coding: utf-8 -*- - -""" -Automated Tool for Optimized Modelling (ATOM) -Author: Mavs -Description: Module containing all available models. The models are - ordered alphabetically. Classes must have the following - structure: - - Class attributes - ---------------- - acronym: str - Acronym of the model's name. - - needs_scaling: bool - Whether the model needs scaled features. - - accepts_sparse: bool - Whether the model has native support for sparse matrices. - - native_multilabel: bool - Whether the model has native support for multilabel tasks. - - native_multioutput: bool - Whether the model has native support for multioutput tasks. - - has_validation: str or None - Whether the model allows in-training validation. If str, - name of the estimator's parameter that states the number - of iterations. If None, no support for in-training - validation. - - supports_engines: list - Engines that can be used to run this model. - - _module: str - Module from which to load the class. 
If one of engines, - ignore the engine name, i.e. use "ensemble" instead of - "sklearn.ensemble". - - _estimators: CustomDict - Name of the estimators per goal. - - Instance attributes - ------------------- - name: str - Name of the model. Defaults to the same as the acronym - but can be different if the same model is called multiple - times. The name is assigned in the basemodel.py module. - - Methods - ------- - _get_parameters(self, x) -> CustomDict: - Return the trial's suggestions with rounded decimals and - (optionally) custom changes to the params. Don't implement - if the parent's implementation is sufficient. - - _trial_to_est(self, params) -> CustomDict: - Convert trial's hyperparameters to parameters for the - estimator. Only implement for models whose study params are - different than those for the estimator. - - _fit_estimator(self, estimator, data, est_params_fit, validation, trial): - This method is called to fit the estimator. Implement only - to customize the fit. - - _get_distributions(self) -> CustomDict: - Return a list of the hyperparameter distributions for - optimization. 
- -""" - -from __future__ import annotations - -import numpy as np -from optuna.distributions import CategoricalDistribution as Cat -from optuna.distributions import FloatDistribution as Float -from optuna.distributions import IntDistribution as Int -from optuna.exceptions import TrialPruned -from optuna.integration import ( - CatBoostPruningCallback, LightGBMPruningCallback, XGBoostPruningCallback, -) -from optuna.trial import Trial - -from atom.basemodel import ClassRegModel, ForecastModel -from atom.pipeline import Pipeline -from atom.utils.types import DATAFRAME, PREDICTOR, SERIES -from atom.utils.utils import ( - CatBMetric, ClassMap, CustomDict, LGBMetric, XGBMetric, sign, -) - - -# Custom models ==================================================== >> - -class CustomModel(ClassRegModel): - """Model with estimator provided by user.""" - - def __init__(self, **kwargs): - if callable(est := kwargs.pop("estimator")): # Estimator provided by the user - self._est = est - self._params = {} - else: - self._est = est.__class__ - self._params = est.get_params() # Store the provided parameters - - if hasattr(est, "name"): - name = est.name - else: - # If no name is provided, use the name of the class - name = self._fullname - if len(n := list(filter(str.isupper, name))) >= 2 and n not in MODELS: - name = "".join(n) - - self.acronym = getattr(est, "acronym", name) - if not name.startswith(self.acronym): - raise ValueError( - f"The name ({name}) and acronym ({self.acronym}) of model " - f"{self._fullname} do not match. The name should start with " - f"the model's acronym." 
- ) - - self.needs_scaling = getattr(est, "needs_scaling", False) - self.native_multilabel = getattr(est, "native_multilabel", False) - self.native_multioutput = getattr(est, "native_multioutput", False) - self.has_validation = getattr(est, "has_validation", None) - - super().__init__(name=name, **kwargs) - - @property - def _fullname(self) -> str: - """Return the estimator's class name.""" - return self._est_class.__name__ - - @property - def _est_class(self): - """Return the estimator's class.""" - return self._est - - def _get_est(self, **params) -> PREDICTOR: - """Get the model's estimator with unpacked parameters. - - Returns - ------- - PREDICTOR - Estimator instance. - - """ - return super()._get_est(**{**self._params, **params}) - - -# Classification and Regression models ============================= >> - -class AdaBoost(ClassRegModel): - """Adaptive Boosting (with decision tree as base estimator). - - AdaBoost is a meta-estimator that begins by fitting a - classifier/regressor on the original dataset and then fits - additional copies of the algorithm on the same dataset but where - the weights of instances are adjusted according to the error of - the current prediction. - - Corresponding estimators are: - - - [AdaBoostClassifier][] for classification tasks. - - [AdaBoostRegressor][] for regression tasks. - - Read more in sklearn's [documentation][adabdocs]. 
- - See Also - -------- - atom.models:GradientBoostingMachine - atom.models:RandomForest - atom.models:XGBoost - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="AdaB", metric="f1", verbose=2) - ``` - - """ - - acronym = "AdaB" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "ensemble" - _estimators = CustomDict({"class": "AdaBoostClassifier", "reg": "AdaBoostRegressor"}) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict( - n_estimators=Int(50, 500, step=10), - learning_rate=Float(0.01, 10, log=True), - ) - - if self.goal == "class": - dist["algorithm"] = Cat(["SAMME.R", "SAMME"]) - else: - dist["loss"] = Cat(["linear", "square", "exponential"]) - - return dist - - -class AutomaticRelevanceDetermination(ClassRegModel): - """Automatic Relevance Determination. - - Automatic Relevance Determination is very similar to - [BayesianRidge][], but can lead to sparser coefficients. Fit the - weights of a regression model, using an ARD prior. The weights of - the regression model are assumed to be in Gaussian distributions. - - Corresponding estimators are: - - - [ARDRegression][] for regression tasks. - - Read more in sklearn's [documentation][arddocs]. 
- - See Also - -------- - atom.models:BayesianRidge - atom.models:GaussianProcess - atom.models:LeastAngleRegression - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="ARD", metric="r2", verbose=2) - ``` - - """ - - acronym = "ARD" - needs_scaling = True - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "ARDRegression"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - n_iter=Int(100, 1000, step=10), - alpha_1=Float(1e-4, 1, log=True), - alpha_2=Float(1e-4, 1, log=True), - lambda_1=Float(1e-4, 1, log=True), - lambda_2=Float(1e-4, 1, log=True), - ) - - -class Bagging(ClassRegModel): - """Bagging model (with decision tree as base estimator). - - Bagging uses an ensemble meta-estimator that fits base predictors - on random subsets of the original dataset and then aggregate their - individual predictions (either by voting or by averaging) to form a - final prediction. Such a meta-estimator can typically be used as a - way to reduce the variance of a black-box estimator by introducing - randomization into its construction procedure and then making an - ensemble out of it. - - Corresponding estimators are: - - - [BaggingClassifier][] for classification tasks. - - [BaggingRegressor][] for regression tasks. - - Read more in sklearn's [documentation][bagdocs]. 
- - See Also - -------- - atom.models:DecisionTree - atom.models:LogisticRegression - atom.models:RandomForest - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="Bag", metric="f1", verbose=2) - ``` - - """ - - acronym = "Bag" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "ensemble" - _estimators = CustomDict({"class": "BaggingClassifier", "reg": "BaggingRegressor"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - n_estimators=Int(10, 500, step=10), - max_samples=Float(0.5, 1.0, step=0.1), - max_features=Float(0.5, 1.0, step=0.1), - bootstrap=Cat([True, False]), - bootstrap_features=Cat([True, False]), - ) - - -class BayesianRidge(ClassRegModel): - """Bayesian ridge regression. - - Bayesian regression techniques can be used to include regularization - parameters in the estimation procedure: the regularization parameter - is not set in a hard sense but tuned to the data at hand. - - Corresponding estimators are: - - - [BayesianRidge][bayesianridgeclass] for regression tasks. - - Read more in sklearn's [documentation][brdocs]. 
- - See Also - -------- - atom.models:AutomaticRelevanceDetermination - atom.models:GaussianProcess - atom.models:LeastAngleRegression - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="BR", metric="r2", verbose=2) - ``` - - """ - - acronym = "BR" - needs_scaling = True - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "BayesianRidge"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - n_iter=Int(100, 1000, step=10), - alpha_1=Float(1e-4, 1, log=True), - alpha_2=Float(1e-4, 1, log=True), - lambda_1=Float(1e-4, 1, log=True), - lambda_2=Float(1e-4, 1, log=True), - ) - - -class BernoulliNB(ClassRegModel): - """Bernoulli Naive Bayes. - - BernoulliNB implements the Naive Bayes algorithm for multivariate - Bernoulli models. Like [MultinomialNB][], this classifier is - suitable for discrete data. The difference is that while MNB works - with occurrence counts, BNB is designed for binary/boolean features. - - Corresponding estimators are: - - - [BernoulliNB][bernoullinbclass] for classification tasks. - - Read more in sklearn's [documentation][bnbdocs]. 
- - See Also - -------- - atom.models:ComplementNB - atom.models:CategoricalNB - atom.models:MultinomialNB - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="BNB", metric="f1", verbose=2) - ``` - - """ - - acronym = "BNB" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "cuml"] - - _module = "naive_bayes" - _estimators = CustomDict({"class": "BernoulliNB"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - alpha=Float(0.01, 10, log=True), - fit_prior=Cat([True, False]), - ) - - -class CatBoost(ClassRegModel): - """Cat Boosting Machine. - - CatBoost is a machine learning method based on gradient boosting - over decision trees. Main advantages of CatBoost: - - - Superior quality when compared with other GBDT models on many - datasets. - - Best in class prediction speed. - - Corresponding estimators are: - - - [CatBoostClassifier][] for classification tasks. - - [CatBoostRegressor][] for regression tasks. - - Read more in CatBoost's [documentation][catbdocs]. - - !!! warning - * CatBoost selects the weights achieved by the best evaluation - on the test set after training. This means that, by default, - there is some minor data leakage in the test set. Use the - `use_best_model=False` parameter to avoid this behavior or use - a [holdout set][data-sets] to evaluate the final estimator. - * [In-training validation][] and [pruning][] are disabled when - `#!python device="gpu"`. - - !!! note - ATOM uses CatBoost's `n_estimators` parameter instead of - `iterations` to indicate the number of trees to fit. 
This is - done to have consistent naming with the [XGBoost][] and - [LightGBM][] models. - - See Also - -------- - atom.models:GradientBoostingMachine - atom.models:LightGBM - atom.models:XGBoost - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="CatB", metric="f1", verbose=2) - ``` - - """ - - acronym = "CatB" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = "n_estimators" - supports_engines = ["catboost"] - - _module = "catboost" - _estimators = CustomDict({"class": "CatBoostClassifier", "reg": "CatBoostRegressor"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("bootstrap_type", params) == "Bernoulli": - params.pop("bagging_temperature") - elif self._get_param("bootstrap_type", params) == "Bayesian": - params.pop("subsample") - - return params - - def _get_est(self, **params) -> PREDICTOR: - """Get the estimator instance. - - Parameters - ---------- - **params - Unpacked hyperparameters for the estimator. - - Returns - ------- - Predictor - Estimator instance. 
- - """ - eval_metric = None - if getattr(self, "_metric", None) and not self._gpu: - eval_metric = CatBMetric(self._metric[0], task=self.task) - - return self._est_class( - eval_metric=params.pop("eval_metric", eval_metric), - train_dir=params.pop("train_dir", ""), - allow_writing_files=params.pop("allow_writing_files", False), - thread_count=params.pop("n_jobs", self.n_jobs), - task_type=params.pop("task_type", "GPU" if self._gpu else "CPU"), - devices=str(self._device_id), - verbose=params.pop("verbose", False), - random_state=params.pop("random_state", self.random_state), - **params, - ) - - def _fit_estimator( - self, - estimator: PREDICTOR, - data: tuple[DATAFRAME, SERIES], - est_params_fit: dict, - validation: tuple[DATAFRAME, SERIES] | None = None, - trial: Trial | None = None, - ): - """Fit the estimator and perform in-training validation. - - Parameters - ---------- - estimator: Predictor - Instance to fit. - - data: tuple - Training data of the form (X, y). - - est_params_fit: dict - Additional parameters for the estimator's fit method. - - validation: tuple or None - Validation data of the form (X, y). If None, no validation - is performed. - - trial: [Trial][] or None - Active trial (during hyperparameter tuning). - - Returns - ------- - Predictor - Fitted instance. 
- - """ - params = est_params_fit.copy() - - callbacks = params.pop("callbacks", []) - if trial and len(self._metric) == 1 and not self._gpu: - callbacks.append(cb := CatBoostPruningCallback(trial, "CatBMetric")) - - # gpu implementation fails if callbacks!=None - estimator.fit(*data, eval_set=validation, callbacks=callbacks or None, **params) - - if not self._gpu: - if validation: - # Create evals attribute with train and validation scores - m = self._metric[0].name - evals = estimator.evals_result_ - self._evals[f"{m}_train"] = evals["learn"]["CatBMetric"] - self._evals[f"{m}_test"] = evals["validation"]["CatBMetric"] - - if trial and len(self._metric) == 1 and cb._pruned: - # Add the pruned step to the output - step = len(self.evals[f'{m}_train']) - steps = estimator.get_params()[self.has_validation] - trial.params[self.has_validation] = f"{step}/{steps}" - - trial.set_user_attr("estimator", estimator) - raise TrialPruned(cb._message) - - return estimator - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - n_estimators=Int(20, 500, step=10), - learning_rate=Float(0.01, 1.0, log=True), - max_depth=Cat([None, *range(1, 17)]), - min_child_samples=Int(1, 30), - bootstrap_type=Cat(["Bayesian", "Bernoulli"]), - bagging_temperature=Float(0, 10), - subsample=Float(0.5, 1.0, step=0.1), - reg_lambda=Float(0.001, 100, log=True), - ) - - -class CategoricalNB(ClassRegModel): - """Categorical Naive Bayes. - - Categorical Naive Bayes implements the Naive Bayes algorithm for - categorical features. - - Corresponding estimators are: - - - [CategoricalNB][categoricalnbclass] for classification tasks. - - Read more in sklearn's [documentation][catnbdocs]. 
- - See Also - -------- - atom.models:BernoulliNB - atom.models:ComplementNB - atom.models:GaussianNB - - Examples - -------- - ```pycon - from atom import ATOMClassifier - import numpy as np - - X = np.random.randint(5, size=(100, 100)) - y = np.random.randint(2, size=100) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="CatNB", metric="f1", verbose=2) - ``` - - """ - - acronym = "CatNB" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "cuml"] - - _module = "naive_bayes" - _estimators = CustomDict({"class": "CategoricalNB"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - alpha=Float(0.01, 10, log=True), - fit_prior=Cat([True, False]), - ) - - -class ComplementNB(ClassRegModel): - """Complement Naive Bayes. - - The Complement Naive Bayes classifier was designed to correct the - "severe assumptions" made by the standard [MultinomialNB][] - classifier. It is particularly suited for imbalanced datasets. - - Corresponding estimators are: - - - [ComplementNB][complementnbclass] for classification tasks. - - Read more in sklearn's [documentation][cnbdocs]. 
- - See Also - -------- - atom.models:BernoulliNB - atom.models:CategoricalNB - atom.models:MultinomialNB - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="CNB", metric="f1", verbose=2) - ``` - - """ - - acronym = "CNB" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "cuml"] - - _module = "naive_bayes" - _estimators = CustomDict({"class": "ComplementNB"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - alpha=Float(0.01, 10, log=True), - fit_prior=Cat([True, False]), - norm=Cat([True, False]), - ) - - -class DecisionTree(ClassRegModel): - """Single Decision Tree. - - A single decision tree classifier/regressor. - - Corresponding estimators are: - - - [DecisionTreeClassifier][] for classification tasks. - - [DecisionTreeRegressor][] for regression tasks. - - Read more in sklearn's [documentation][treedocs]. 
- - See Also - -------- - atom.models:ExtraTree - atom.models:ExtraTrees - atom.models:RandomForest - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="Tree", metric="f1", verbose=2) - ``` - - """ - - acronym = "Tree" - needs_scaling = False - accepts_sparse = True - native_multilabel = True - native_multioutput = True - has_validation = None - supports_engines = ["sklearn"] - - _module = "tree" - _estimators = CustomDict( - {"class": "DecisionTreeClassifier", "reg": "DecisionTreeRegressor"} - ) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - if self.goal == "class": - criterion = ["gini", "entropy"] - else: - criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"] - - return CustomDict( - criterion=Cat(criterion), - splitter=Cat(["best", "random"]), - max_depth=Cat([None, *range(1, 17)]), - min_samples_split=Int(2, 20), - min_samples_leaf=Int(1, 20), - max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), - ccp_alpha=Float(0, 0.035, step=0.005), - ) - - -class Dummy(ClassRegModel): - """Dummy classifier/regressor. - - When doing supervised learning, a simple sanity check consists of - comparing one's estimator against simple rules of thumb. The - prediction methods completely ignore the input data. Do not use - this model for real problems. Use it only as a simple baseline - to compare with other models. - - Corresponding estimators are: - - - [DummyClassifier][] for classification tasks. - - [DummyRegressor][] for regression tasks. - - Read more in sklearn's [documentation][dummydocs]. 
- - See Also - -------- - atom.models:DecisionTree - atom.models:ExtraTree - atom.models:NaiveForecaster - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="Dummy", metric="f1", verbose=2) - ``` - - """ - - acronym = "Dummy" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "dummy" - _estimators = CustomDict({"class": "DummyClassifier", "reg": "DummyRegressor"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("strategy", params) != "quantile": - params.pop("quantile") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict() - if self.goal == "class": - dist["strategy"] = Cat(["most_frequent", "prior", "stratified", "uniform"]) - else: - dist["strategy"] = Cat(["mean", "median", "quantile"]) - dist["quantile"] = Float(0, 1.0, step=0.1) - - return dist - - -class ElasticNet(ClassRegModel): - """Linear Regression with elasticnet regularization. - - Linear least squares with l1 and l2 regularization. - - Corresponding estimators are: - - - [ElasticNet][elasticnetreg] for regression tasks. - - Read more in sklearn's [documentation][endocs]. 
- - See Also - -------- - atom.models:Lasso - atom.models:OrdinaryLeastSquares - atom.models:Ridge - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="EN", metric="r2", verbose=2) - ``` - - """ - - acronym = "EN" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "ElasticNet"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - alpha=Float(1e-3, 10, log=True), - l1_ratio=Float(0.1, 0.9, step=0.1), - selection=Cat(["cyclic", "random"]), - ) - - -class ExtraTree(ClassRegModel): - """Extremely Randomized Tree. - - Extra-trees differ from classic decision trees in the way they are - built. When looking for the best split to separate the samples of a - node into two groups, random splits are drawn for each of the - max_features randomly selected features and the best split among - those is chosen. When max_features is set 1, this amounts to - building a totally random decision tree. - - Corresponding estimators are: - - - [ExtraTreeClassifier][] for classification tasks. - - [ExtraTreeRegressor][] for regression tasks. - - Read more in sklearn's [documentation][treedocs]. 
- - See Also - -------- - atom.models:DecisionTree - atom.models:ExtraTrees - atom.models:RandomForest - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="ETree", metric="f1", verbose=2) - ``` - - """ - - acronym = "ETree" - needs_scaling = False - accepts_sparse = True - native_multilabel = True - native_multioutput = True - has_validation = None - supports_engines = ["sklearn"] - - _module = "tree" - _estimators = CustomDict( - {"class": "ExtraTreeClassifier", "reg": "ExtraTreeRegressor"} - ) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if not self._get_param("bootstrap", params): - params.pop("max_samples") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - if self.goal == "class": - criterion = ["gini", "entropy"] - else: - criterion = ["squared_error", "absolute_error"] - - return CustomDict( - criterion=Cat(criterion), - splitter=Cat(["random", "best"]), - max_depth=Cat([None, *range(1, 17)]), - min_samples_split=Int(2, 20), - min_samples_leaf=Int(1, 20), - max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), - ccp_alpha=Float(0, 0.035, step=0.005), - ) - - -class ExtraTrees(ClassRegModel): - """Extremely Randomized Trees. - - Extra-Trees use a meta estimator that fits a number of randomized - decision trees (a.k.a. [extra-trees][extratree]) on various - sub-samples of the dataset and uses averaging to improve the - predictive accuracy and control over-fitting. 
- - Corresponding estimators are: - - - [ExtraTreesClassifier][] for classification tasks. - - [ExtraTreesRegressor][] for regression tasks. - - Read more in sklearn's [documentation][etdocs]. - - See Also - -------- - atom.models:DecisionTree - atom.models:ExtraTree - atom.models:RandomForest - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="ET", metric="f1", verbose=2) - ``` - - """ - - acronym = "ET" - needs_scaling = False - accepts_sparse = True - native_multilabel = True - native_multioutput = True - has_validation = None - supports_engines = ["sklearn"] - - _module = "ensemble" - _estimators = CustomDict( - {"class": "ExtraTreesClassifier", "reg": "ExtraTreesRegressor"} - ) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if not self._get_param("bootstrap", params): - params.pop("max_samples") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - if self.goal == "class": - criterion = ["gini", "entropy"] - else: - criterion = ["squared_error", "absolute_error"] - - return CustomDict( - n_estimators=Int(10, 500, step=10), - criterion=Cat(criterion), - max_depth=Cat([None, *range(1, 17)]), - min_samples_split=Int(2, 20), - min_samples_leaf=Int(1, 20), - max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), - bootstrap=Cat([True, False]), - max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]), - ccp_alpha=Float(0, 0.035, step=0.005), - ) - - -class GaussianNB(ClassRegModel): - """Gaussian Naive Bayes. - - Gaussian Naive Bayes implements the Naive Bayes algorithm for - classification. The likelihood of the features is assumed to - be Gaussian. - - Corresponding estimators are: - - - [GaussianNB][gaussiannbclass] for classification tasks. - - Read more in sklearn's [documentation][gnbdocs]. - - See Also - -------- - atom.models:BernoulliNB - atom.models:CategoricalNB - atom.models:ComplementNB - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="GNB", metric="f1", verbose=2) - ``` - - """ - - acronym = "GNB" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "cuml"] - - _module = "naive_bayes" - _estimators = CustomDict({"class": "GaussianNB"}) - - -class GaussianProcess(ClassRegModel): - """Gaussian process. - - Gaussian Processes are a generic supervised learning method - designed to solve regression and probabilistic classification - problems. The advantages of Gaussian processes are: - - * The prediction interpolates the observations. 
- * The prediction is probabilistic (Gaussian) so that one can compute - empirical confidence intervals and decide based on those if one - should refit (online fitting, adaptive fitting) the prediction in - some region of interest. - - The disadvantages of Gaussian processes include: - - * They are not sparse, i.e. they use the whole samples/features - information to perform the prediction. - * They lose efficiency in high dimensional spaces, namely when the - number of features exceeds a few dozens. - - Corresponding estimators are: - - - [GaussianProcessClassifier][] for classification tasks. - - [GaussianProcessRegressor][] for regression tasks. - - Read more in sklearn's [documentation][gpdocs]. - - See Also - -------- - atom.models:GaussianNB - atom.models:LinearDiscriminantAnalysis - atom.models:PassiveAggressive - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="GP", metric="f1", verbose=2) - ``` - - """ - - acronym = "GP" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "gaussian_process" - _estimators = CustomDict( - {"class": "GaussianProcessClassifier", "reg": "GaussianProcessRegressor"} - ) - - -class GradientBoostingMachine(ClassRegModel): - """Gradient Boosting Machine. - - A Gradient Boosting Machine builds an additive model in a forward - stage-wise fashion; it allows for the optimization of arbitrary - differentiable loss functions. In each stage `n_classes_` regression - trees are fit on the negative gradient of the loss function, e.g. - binary or multiclass log loss. Binary classification is a special - case where only a single regression tree is induced. 
- - Corresponding estimators are: - - - [GradientBoostingClassifier][] for classification tasks. - - [GradientBoostingRegressor][] for regression tasks. - - Read more in sklearn's [documentation][gbmdocs]. - - !!! tip - [HistGradientBoosting][] is a much faster variant of this - algorithm for intermediate datasets (n_samples >= 10k). - - See Also - -------- - atom.models:CatBoost - atom.models:HistGradientBoosting - atom.models:LightGBM - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="GBM", metric="f1", verbose=2) - ``` - - """ - - acronym = "GBM" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "ensemble" - _estimators = CustomDict( - {"class": "GradientBoostingClassifier", "reg": "GradientBoostingRegressor"} - ) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("loss", params) not in ("huber", "quantile"): - params.pop("alpha") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - dist = CustomDict( - loss=Cat(["log_loss", "exponential"]), - learning_rate=Float(0.01, 1.0, log=True), - n_estimators=Int(10, 500, step=10), - subsample=Float(0.5, 1.0, step=0.1), - criterion=Cat(["friedman_mse", "squared_error"]), - min_samples_split=Int(2, 20), - min_samples_leaf=Int(1, 20), - max_depth=Int(1, 21), - max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), - ccp_alpha=Float(0, 0.035, step=0.005), - ) - - if self.task.startswith("multiclass"): - dist.pop("loss") # Multiclass only supports log_loss - elif self.goal.startswith("reg"): - dist["loss"] = Cat(["squared_error", "absolute_error", "huber", "quantile"]) - dist["alpha"] = Float(0.1, 0.9, step=0.1) - - return dist - - -class HuberRegression(ClassRegModel): - """Huber regressor. - - Huber is a linear regression model that is robust to outliers. It - makes sure that the loss function is not heavily influenced by the - outliers while not completely ignoring their effect. - - Corresponding estimators are: - - - [HuberRegressor][] for regression tasks. - - Read more in sklearn's [documentation][huberdocs]. - - See Also - -------- - atom.models:AutomaticRelevanceDetermination - atom.models:LeastAngleRegression - atom.models:OrdinaryLeastSquares - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="Huber", metric="r2", verbose=2) - ``` - - """ - - acronym = "Huber" - needs_scaling = True - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "HuberRegressor"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - return CustomDict( - epsilon=Float(1, 10, log=True), - max_iter=Int(50, 500, step=10), - alpha=Float(1e-4, 1, log=True), - ) - - -class HistGradientBoosting(ClassRegModel): - """Histogram-based Gradient Boosting Machine. - - This Histogram-based Gradient Boosting Machine is much faster than - the standard [GradientBoostingMachine][] for big datasets - (n_samples>=10k). This variation first bins the input samples into - integer-valued bins which tremendously reduces the number of - splitting points to consider, and allows the algorithm to leverage - integer-based data structures (histograms) instead of relying on - sorted continuous values when building the trees. - - Corresponding estimators are: - - - [HistGradientBoostingClassifier][] for classification tasks. - - [HistGradientBoostingRegressor][] for regression tasks. - - Read more in sklearn's [documentation][hgbmdocs]. - - See Also - -------- - atom.models:CatBoost - atom.models:GradientBoostingMachine - atom.models:XGBoost - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="hGBM", metric="f1", verbose=2) - ``` - - """ - - acronym = "hGBM" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "ensemble" - _estimators = CustomDict( - { - "class": "HistGradientBoostingClassifier", - "reg": "HistGradientBoostingRegressor", - } - ) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - dist = CustomDict( - loss=Cat(["squared_error", "absolute_error", "poisson", "quantile", "gamma"]), - learning_rate=Float(0.01, 1.0, log=True), - max_iter=Int(10, 500, step=10), - max_leaf_nodes=Int(10, 50), - max_depth=Cat([None, *range(1, 17)]), - min_samples_leaf=Int(10, 30), - l2_regularization=Float(0, 1.0, step=0.1), - ) - - if self.goal == "class": - dist.pop("loss") - - return dist - - -class KNearestNeighbors(ClassRegModel): - """K-Nearest Neighbors. - - K-Nearest Neighbors, as the name clearly indicates, implements the - k-nearest neighbors vote. For regression, the target is predicted - by local interpolation of the targets associated of the nearest - neighbors in the training set. - - Corresponding estimators are: - - - [KNeighborsClassifier][] for classification tasks. - - [KNeighborsRegressor][] for classification tasks. - - Read more in sklearn's [documentation][knndocs]. - - See Also - -------- - atom.models:LinearDiscriminantAnalysis - atom.models:QuadraticDiscriminantAnalysis - atom.models:RadiusNearestNeighbors - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="KNN", metric="f1", verbose=2) - ``` - - """ - - acronym = "KNN" - needs_scaling = True - accepts_sparse = True - native_multilabel = True - native_multioutput = True - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "neighbors" - _estimators = CustomDict( - {"class": "KNeighborsClassifier", "reg": "KNeighborsRegressor"} - ) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - dist = CustomDict( - n_neighbors=Int(1, 100), - weights=Cat(["uniform", "distance"]), - algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]), - leaf_size=Int(20, 40), - p=Int(1, 2), - ) - - if self._gpu: - dist.pop("algorithm") # Only 'brute' is supported - if self.engine["estimator"] == "cuml": - dist.pop("weights") # Only 'uniform' is supported - dist.pop("leaf_size") - dist.pop("p") - - return dist - - -class Lasso(ClassRegModel): - """Linear Regression with lasso regularization. - - Linear least squares with l1 regularization. - - Corresponding estimators are: - - - [Lasso][lassoreg] for regression tasks. - - Read more in sklearn's [documentation][lassodocs]. - - See Also - -------- - atom.models:ElasticNet - atom.models:OrdinaryLeastSquares - atom.models:Ridge - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="Lasso", metric="r2", verbose=2) - ``` - - """ - - acronym = "Lasso" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "Lasso"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - alpha=Float(1e-3, 10, log=True), - selection=Cat(["cyclic", "random"]), - ) - - -class LeastAngleRegression(ClassRegModel): - """Least Angle Regression. - - Least-Angle Regression is a regression algorithm for - high-dimensional data. Lars is similar to forward stepwise - regression. At each step, it finds the feature most correlated - with the target. 
When there are multiple features having equal - correlation, instead of continuing along the same feature, it - proceeds in a direction equiangular between the features. - - Corresponding estimators are: - - - [Lars][] for regression tasks. - - Read more in sklearn's [documentation][larsdocs]. - - See Also - -------- - atom.models:BayesianRidge - atom.models:HuberRegression - atom.models:OrdinaryLeastSquares - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="Lars", metric="r2", verbose=2) - ``` - - """ - - acronym = "Lars" - needs_scaling = True - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "Lars"}) - - -class LightGBM(ClassRegModel): - """Light Gradient Boosting Machine. - - LightGBM is a gradient boosting model that uses tree based learning - algorithms. It is designed to be distributed and efficient with the - following advantages: - - - Faster training speed and higher efficiency. - - Lower memory usage. - - Better accuracy. - - Capable of handling large-scale data. - - Corresponding estimators are: - - - [LGBMClassifier][] for classification tasks. - - [LGBMRegressor][] for regression tasks. - - Read more in LightGBM's [documentation][lgbdocs]. - - !!! info - Using LightGBM's [GPU acceleration][estimator-acceleration] - requires [additional software dependencies][lgb_gpu]. 
- - See Also - -------- - atom.models:CatBoost - atom.models:GradientBoostingMachine - atom.models:XGBoost - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="LGB", metric="f1", verbose=2) - ``` - - """ - - acronym = "LGB" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = "n_estimators" - supports_engines = ["lightgbm"] - - _module = "lightgbm.sklearn" - _estimators = CustomDict({"class": "LGBMClassifier", "reg": "LGBMRegressor"}) - - def _get_est(self, **params) -> PREDICTOR: - """Get the model's estimator with unpacked parameters. - - Returns - ------- - Predictor - Estimator instance. - - """ - # Custom lightgbm mapping for warnings - # PYTHONWARNINGS doesn't work since they go from C/C++ code to stdout - warns = dict(always=2, default=1, error=0, ignore=-1) - - return self._est_class( - verbose=params.pop("verbose", warns.get(self.warnings, -1)), - n_jobs=params.pop("n_jobs", self.n_jobs), - device=params.pop("device", "gpu" if self._gpu else "cpu"), - gpu_device_id=params.pop("gpu_device_id", self._device_id or -1), - random_state=params.pop("random_state", self.random_state), - **params, - ) - - def _fit_estimator( - self, - estimator: PREDICTOR, - data: tuple[DATAFRAME, SERIES], - est_params_fit: dict, - validation: tuple[DATAFRAME, SERIES] | None = None, - trial: Trial | None = None, - ): - """Fit the estimator and perform in-training validation. - - Parameters - ---------- - estimator: Predictor - Instance to fit. - - data: tuple - Training data of the form (X, y). - - est_params_fit: dict - Additional parameters for the estimator's fit method. - - validation: tuple or None - Validation data of the form (X, y). If None, no validation - is performed. 
- - trial: [Trial][] or None - Active trial (during hyperparameter tuning). - - Returns - ------- - Predictor - Fitted instance. - - """ - from lightgbm.callback import log_evaluation - - m = self._metric[0].name - params = est_params_fit.copy() - - callbacks = params.pop("callbacks", []) + [log_evaluation(-1)] - if trial and len(self._metric) == 1: - callbacks.append(LightGBMPruningCallback(trial, m, "valid_1")) - - eval_metric = None - if getattr(self, "_metric", None): - eval_metric = LGBMetric(self._metric[0], task=self.task) - - try: - estimator.fit( - *data, - eval_set=[data, validation] if validation else None, - eval_metric=params.pop("eval_metric", eval_metric), - callbacks=callbacks, - **params, - ) - except TrialPruned as ex: - # Add the pruned step to the output - step = str(ex).split(" ")[-1][:-1] - steps = estimator.get_params()[self.has_validation] - trial.params[self.has_validation] = f"{step}/{steps}" - - trial.set_user_attr("estimator", estimator) - raise ex - - if validation: - # Create evals attribute with train and validation scores - self._evals[f"{m}_train"] = estimator.evals_result_["training"][m] - self._evals[f"{m}_test"] = estimator.evals_result_["valid_1"][m] - - return estimator - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - n_estimators=Int(20, 500, step=10), - learning_rate=Float(0.01, 1.0, log=True), - max_depth=Int(-1, 17, step=2), - num_leaves=Int(20, 40), - min_child_weight=Float(1e-4, 100, log=True), - min_child_samples=Int(1, 30), - subsample=Float(0.5, 1.0, step=0.1), - colsample_bytree=Float(0.4, 1.0, step=0.1), - reg_alpha=Float(1e-4, 100, log=True), - reg_lambda=Float(1e-4, 100, log=True), - ) - - -class LinearDiscriminantAnalysis(ClassRegModel): - """Linear Discriminant Analysis. 
- - Linear Discriminant Analysis is a classifier with a linear - decision boundary, generated by fitting class conditional densities - to the data and using Bayes’ rule. The model fits a Gaussian - density to each class, assuming that all classes share the same - covariance matrix. - - Corresponding estimators are: - - - [LinearDiscriminantAnalysis][ldaclassifier] for classification tasks. - - Read more in sklearn's [documentation][ldadocs]. - - See Also - -------- - atom.models:LogisticRegression - atom.models:RadiusNearestNeighbors - atom.models:QuadraticDiscriminantAnalysis - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="LDA", metric="f1", verbose=2) - ``` - - """ - - acronym = "LDA" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "discriminant_analysis" - _estimators = CustomDict({"class": "LinearDiscriminantAnalysis"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("solver", params) == "svd": - params.pop("shrinkage") - - return params - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - solver=Cat(["svd", "lsqr", "eigen"]), - shrinkage=Cat([None, "auto", 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]), - ) - - -class LinearSVM(ClassRegModel): - """Linear Support Vector Machine. - - Similar to [SupportVectorMachine][] but with a linear kernel. 
- Implemented in terms of liblinear rather than libsvm, so it has - more flexibility in the choice of penalties and loss functions and - should scale better to large numbers of samples. - - Corresponding estimators are: - - - [LinearSVC][] for classification tasks. - - [LinearSVR][] for classification tasks. - - Read more in sklearn's [documentation][svmdocs]. - - See Also - -------- - atom.models:KNearestNeighbors - atom.models:StochasticGradientDescent - atom.models:SupportVectorMachine - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="lSVM", metric="f1", verbose=2) - ``` - - """ - - acronym = "lSVM" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "cuml"] - - _module = "svm" - _estimators = CustomDict({"class": "LinearSVC", "reg": "LinearSVR"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. 
- - """ - params = super()._get_parameters(trial) - - if self.goal == "class": - if self._get_param("loss", params) == "hinge": - # l1 regularization can't be combined with hinge - params.replace_value("penalty", "l2") - # l2 regularization can't be combined with hinge when dual=False - params.replace_value("dual", True) - elif self._get_param("loss", params) == "squared_hinge": - # l1 regularization can't be combined with squared_hinge when dual=True - if self._get_param("penalty", params) == "l1": - params.replace_value("dual", False) - elif self._get_param("loss", params) == "epsilon_insensitive": - params.replace_value("dual", True) - - return params - - def _get_est(self, **params) -> PREDICTOR: - """Get the estimator instance. - - Parameters - ---------- - **params - Unpacked hyperparameters for the estimator. - - Returns - ------- - Predictor - Estimator instance. - - """ - if self.engine["estimator"] == "cuml" and self.goal == "class": - return self._est_class(probability=params.pop("probability", True), **params) - else: - return super()._get_est(**params) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict() - if self.goal == "class": - dist["penalty"] = Cat(["l1", "l2"]) - dist["loss"] = Cat(["hinge", "squared_hinge"]) - else: - dist["loss"] = Cat(["epsilon_insensitive", "squared_epsilon_insensitive"]) - - dist["C"] = Float(1e-3, 100, log=True) - dist["dual"] = Cat([True, False]) - - if self.engine["estimator"] == "cuml": - dist.pop("dual") - - return dist - - -class LogisticRegression(ClassRegModel): - """Logistic Regression. - - Logistic regression, despite its name, is a linear model for - classification rather than regression. Logistic regression is also - known in the literature as logit regression, maximum-entropy - classification (MaxEnt) or the log-linear classifier. 
In this model, - the probabilities describing the possible outcomes of a single trial - are modeled using a logistic function. - - Corresponding estimators are: - - - [LogisticRegression][] for classification tasks. - - Read more in sklearn's [documentation][lrdocs]. - - See Also - -------- - atom.models:GaussianProcess - atom.models:LinearDiscriminantAnalysis - atom.models:PassiveAggressive - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="RF", metric="f1", verbose=2) - ``` - - """ - - acronym = "LR" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "linear_model" - _estimators = CustomDict({"class": "LogisticRegression"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - # Limitations on penalty + solver combinations - penalty = self._get_param("penalty", params) - solver = self._get_param("solver", params) - cond_1 = penalty is None and solver == "liblinear" - cond_2 = penalty == "l1" and solver not in ("liblinear", "saga") - cond_3 = penalty == "elasticnet" and solver != "saga" - - if cond_1 or cond_2 or cond_3: - params.replace_value("penalty", "l2") # Change to default value - - if self._get_param("penalty", params) != "elasticnet": - params.pop("l1_ratio") - - if self._get_param("penalty", params) is None: - params.pop("C") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. 
- - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict( - penalty=Cat([None, "l1", "l2", "elasticnet"]), - C=Float(1e-3, 100, log=True), - solver=Cat(["lbfgs", "newton-cg", "liblinear", "sag", "saga"]), - max_iter=Int(100, 1000, step=10), - l1_ratio=Float(0, 1.0, step=0.1), - ) - - if self._gpu: - dist.pop("solver") - dist.pop("penalty") # Only 'l2' is supported - elif self.engine["estimator"] == "sklearnex": - dist["solver"] = Cat(["lbfgs", "newton-cg"]) - - return dist - - -class MultiLayerPerceptron(ClassRegModel): - """Multi-layer Perceptron. - - Multi-layer Perceptron is a supervised learning algorithm that - learns a function by training on a dataset. Given a set of features - and a target, it can learn a non-linear function approximator for - either classification or regression. It is different from logistic - regression, in that between the input and the output layer, there - can be one or more non-linear layers, called hidden layers. - - Corresponding estimators are: - - - [MLPClassifier][] for classification tasks. - - [MLPRegressor][] for regression tasks. - - Read more in sklearn's [documentation][mlpdocs]. - - See Also - -------- - atom.models:PassiveAggressive - atom.models:Perceptron - atom.models:StochasticGradientDescent - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="MLP", metric="f1", verbose=2) - ``` - - """ - - acronym = "MLP" - needs_scaling = True - accepts_sparse = True - native_multilabel = True - native_multioutput = False - has_validation = "max_iter" - supports_engines = ["sklearn"] - - _module = "neural_network" - _estimators = CustomDict({"class": "MLPClassifier", "reg": "MLPRegressor"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. 
- - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - # Drop layers when a previous layer has 0 neurons - drop = False - for param in [p for p in sorted(params) if p.startswith("hidden_layer")]: - if params[param] == 0 or drop: - drop = True - params.pop(param) - - if self._get_param("solver", params) != "sgd": - params.pop("learning_rate") - params.pop("power_t") - else: - params.pop("learning_rate_init") - - return params - - def _trial_to_est(self, params: CustomDict) -> CustomDict: - """Convert trial's hyperparameters to parameters for the estimator. - - Parameters - ---------- - params: CustomDict - Trial's hyperparameters. - - Returns - ------- - CustomDict - Estimator's hyperparameters. - - """ - params = super()._trial_to_est(params) - - hidden_layer_sizes = [] - for param in [p for p in sorted(params) if p.startswith("hidden_layer")]: - hidden_layer_sizes.append(params.pop(param)) - - if hidden_layer_sizes: - params.insert(0, "hidden_layer_sizes", tuple(hidden_layer_sizes)) - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict( - hidden_layer_1=Int(10, 100), - hidden_layer_2=Int(0, 100), - hidden_layer_3=Int(0, 10), - activation=Cat(["identity", "logistic", "tanh", "relu"]), - solver=Cat(["lbfgs", "sgd", "adam"]), - alpha=Float(1e-4, 0.1, log=True), - batch_size=Cat(["auto", 8, 16, 32, 64, 128, 256]), - learning_rate=Cat(["constant", "invscaling", "adaptive"]), - learning_rate_init=Float(1e-3, 0.1, log=True), - power_t=Float(0.1, 0.9, step=0.1), - max_iter=Int(50, 500, step=10), - ) - - # Drop layers if sizes are specified by user - return dist[3:] if "hidden_layer_sizes" in self._est_params else dist - - -class MultinomialNB(ClassRegModel): - """Multinomial Naive Bayes. 
- - MultinomialNB implements the Naive Bayes algorithm for multinomially - distributed data, and is one of the two classic Naive Bayes variants - used in text classification (where the data are typically - represented as word vector counts, although tf-idf vectors are also - known to work well in practice). - - Corresponding estimators are: - - - [MultinomialNB][multinomialnbclass] for classification tasks. - - Read more in sklearn's [documentation][mnbdocs]. - - See Also - -------- - atom.models:BernoulliNB - atom.models:ComplementNB - atom.models:GaussianNB - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="MNB", metric="f1", verbose=2) - ``` - - """ - - acronym = "MNB" - needs_scaling = False - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "cuml"] - - _module = "naive_bayes" - _estimators = CustomDict({"class": "MultinomialNB"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - alpha=Float(0.01, 10, log=True), - fit_prior=Cat([True, False]), - ) - - -class OrdinaryLeastSquares(ClassRegModel): - """Linear Regression. - - Ordinary Least Squares is just linear regression without any - regularization. It fits a linear model with coefficients `w=(w1, - ..., wp)` to minimize the residual sum of squares between the - observed targets in the dataset, and the targets predicted by the - linear approximation. - - Corresponding estimators are: - - - [LinearRegression][] for regression tasks. - - Read more in sklearn's [documentation][olsdocs]. 
- - See Also - -------- - atom.models:ElasticNet - atom.models:Lasso - atom.models:Ridge - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="OLS", metric="r2", verbose=2) - ``` - - """ - - acronym = "OLS" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "LinearRegression"}) - - -class OrthogonalMatchingPursuit(ClassRegModel): - """Orthogonal Matching Pursuit. - - Orthogonal Matching Pursuit implements the OMP algorithm for - approximating the fit of a linear model with constraints imposed - on the number of non-zero coefficients. - - Corresponding estimators are: - - - [OrthogonalMatchingPursuit][] for regression tasks. - - Read more in sklearn's [documentation][ompdocs]. - - See Also - -------- - atom.models:Lasso - atom.models:LeastAngleRegression - atom.models:OrdinaryLeastSquares - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="OMP", metric="r2", verbose=2) - ``` - - """ - - acronym = "OMP" - needs_scaling = True - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"reg": "OrthogonalMatchingPursuit"}) - - -class PassiveAggressive(ClassRegModel): - """Passive Aggressive. - - The passive-aggressive algorithms are a family of algorithms for - large-scale learning. They are similar to the Perceptron in that - they do not require a learning rate. 
However, contrary to the - [Perceptron][], they include a regularization parameter `C`. - - Corresponding estimators are: - - - [PassiveAggressiveClassifier][] for classification tasks. - - [PassiveAggressiveRegressor][] for classification tasks. - - Read more in sklearn's [documentation][padocs]. - - See Also - -------- - atom.models:MultiLayerPerceptron - atom.models:Perceptron - atom.models:StochasticGradientDescent - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="PA", metric="f1", verbose=2) - ``` - - """ - - acronym = "PA" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = "max_iter" - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict( - {"class": "PassiveAggressiveClassifier", "reg": "PassiveAggressiveRegressor"} - ) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - if self.goal == "class": - loss = ["hinge", "squared_hinge"] - else: - loss = ["epsilon_insensitive", "squared_epsilon_insensitive"] - - return CustomDict( - C=Float(1e-3, 100, log=True), - max_iter=Int(500, 1500, step=50), - loss=Cat(loss), - average=Cat([True, False]), - ) - - -class Perceptron(ClassRegModel): - """Linear Perceptron classification. - - The Perceptron is a simple classification algorithm suitable for - large scale learning. By default: - - * It does not require a learning rate. - * It is not regularized (penalized). - * It updates its model only on mistakes. - - The last characteristic implies that the Perceptron is slightly - faster to train than [StochasticGradientDescent][] with the hinge - loss and that the resulting models are sparser. 
- - Corresponding estimators are: - - - [Perceptron][percclassifier] for classification tasks. - - Read more in sklearn's [documentation][percdocs]. - - See Also - -------- - atom.models:MultiLayerPerceptron - atom.models:PassiveAggressive - atom.models:StochasticGradientDescent - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="Perc", metric="f1", verbose=2) - ``` - - """ - - acronym = "Perc" - needs_scaling = True - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = "max_iter" - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"class": "Perceptron"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("penalty", params) != "elasticnet": - params.pop("l1_ratio") - - return params - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - penalty=Cat([None, "l2", "l1", "elasticnet"]), - alpha=Float(1e-4, 10, log=True), - l1_ratio=Float(0.1, 0.9, step=0.1), - max_iter=Int(500, 1500, step=50), - eta0=Float(1e-2, 10, log=True), - ) - - -class QuadraticDiscriminantAnalysis(ClassRegModel): - """Quadratic Discriminant Analysis. - - Quadratic Discriminant Analysis is a classifier with a quadratic - decision boundary, generated by fitting class conditional densities - to the data and using Bayes’ rule. 
The model fits a Gaussian - density to each class, assuming that all classes share the same - covariance matrix. - - Corresponding estimators are: - - - [QuadraticDiscriminantAnalysis][qdaclassifier] for classification tasks. - - Read more in sklearn's [documentation][ldadocs]. - - See Also - -------- - atom.models:LinearDiscriminantAnalysis - atom.models:LogisticRegression - atom.models:RadiusNearestNeighbors - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="QDA", metric="f1", verbose=2) - ``` - - """ - - acronym = "QDA" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn"] - - _module = "discriminant_analysis" - _estimators = CustomDict({"class": "QuadraticDiscriminantAnalysis"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict(reg_param=Float(0, 1.0, step=0.1)) - - -class RadiusNearestNeighbors(ClassRegModel): - """Radius Nearest Neighbors. - - Radius Nearest Neighbors implements the nearest neighbors vote, - where the neighbors are selected from within a given radius. For - regression, the target is predicted by local interpolation of the - targets associated of the nearest neighbors in the training set. - - !!! warning - * The `radius` parameter should be tuned to the data at hand or - the model will perform poorly. - * If outliers are detected, the estimator raises an exception - unless `est_params={"outlier_label": "most_frequent"}` is used. - - Corresponding estimators are: - - - [RadiusNeighborsClassifier][] for classification tasks. - - [RadiusNeighborsRegressor][] for regression tasks. 
- - Read more in sklearn's [documentation][knndocs]. - - See Also - -------- - atom.models:KNearestNeighbors - atom.models:LinearDiscriminantAnalysis - atom.models:QuadraticDiscriminantAnalysis - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run( - models="RNN", - metric="f1", - est_params={"outlier_label": "most_frequent"}, - verbose=2, - ) - ``` - - """ - - acronym = "RNN" - needs_scaling = True - accepts_sparse = True - native_multilabel = True - native_multioutput = True - has_validation = None - supports_engines = ["sklearn"] - - _module = "neighbors" - _estimators = CustomDict( - {"class": "RadiusNeighborsClassifier", "reg": "RadiusNeighborsRegressor"} - ) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - radius=Float(1e-2, 100), - weights=Cat(["uniform", "distance"]), - algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]), - leaf_size=Int(20, 40), - p=Int(1, 2), - ) - - -class RandomForest(ClassRegModel): - """Random Forest. - - Random forests are an ensemble learning method that operate by - constructing a multitude of decision trees at training time and - outputting the class that is the mode of the classes - (classification) or mean prediction (regression) of the individual - trees. Random forests correct for decision trees' habit of - overfitting to their training set. - - Corresponding estimators are: - - - [RandomForestClassifier][] for classification tasks. - - [RandomForestRegressor][] for regression tasks. - - Read more in sklearn's [documentation][adabdocs]. - - !!! warning - cuML's implementation of [RandomForestClassifier][cumlrf] only - supports predictions on dtype `float32`. 
Convert all dtypes - before calling atom's [run][atomclassifier-run] method to avoid - exceptions. - - See Also - -------- - atom.models:DecisionTree - atom.models:ExtraTrees - atom.models:HistGradientBoosting - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="RF", metric="f1", verbose=2) - ``` - - """ - - acronym = "RF" - needs_scaling = False - accepts_sparse = True - native_multilabel = True - native_multioutput = True - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "ensemble" - _estimators = CustomDict( - {"class": "RandomForestClassifier", "reg": "RandomForestRegressor"} - ) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if not self._get_param("bootstrap", params): - params.pop("max_samples") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - if self.goal == "class": - criterion = ["gini", "entropy"] - else: - if self.engine["estimator"] == "cuml": - criterion = ["mse", "poisson", "gamma", "inverse_gaussian"] - else: - criterion = ["squared_error", "absolute_error", "poisson"] - - dist = CustomDict( - n_estimators=Int(10, 500, step=10), - criterion=Cat(criterion), - max_depth=Cat([None, *range(1, 17)]), - min_samples_split=Int(2, 20), - min_samples_leaf=Int(1, 20), - max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), - bootstrap=Cat([True, False]), - max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]), - ccp_alpha=Float(0, 0.035, step=0.005), - ) - - if self.engine["estimator"] == "sklearnex": - dist.pop("criterion") - dist.pop("ccp_alpha") - elif self.engine["estimator"] == "cuml": - dist.replace_key("criterion", "split_criterion") - dist["max_depth"] = Int(1, 17) - dist["max_features"] = Cat(["sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]) - dist["max_samples"] = Float(0.5, 0.9, step=0.1) - dist.pop("ccp_alpha") - - return dist - - -class Ridge(ClassRegModel): - """Linear least squares with l2 regularization. - - If classifier, it first converts the target values into {-1, 1} - and then treats the problem as a regression task. - - Corresponding estimators are: - - - [RidgeClassifier][] for classification tasks. - - [Ridge][ridgeregressor] for regression tasks. - - Read more in sklearn's [documentation][ridgedocs]. - - !!! warning - Engines `sklearnex` and `cuml` are only available for regression - tasks. 
- - See Also - -------- - atom.models:BayesianRidge - atom.models:ElasticNet - atom.models:Lasso - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import fetch_california_housing - - X, y = fetch_california_housing(return_X_y=True) - - atom = ATOMRegressor(X, y, random_state=1) - atom.run(models="Ridge", metric="r2", verbose=2) - ``` - - """ - - acronym = "Ridge" - needs_scaling = True - accepts_sparse = True - native_multilabel = True - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "linear_model" - _estimators = CustomDict({"class": "RidgeClassifier", "reg": "Ridge"}) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict( - alpha=Float(1e-3, 10, log=True), - solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]), - ) - - if self.goal == "reg": - if self.engine["estimator"] == "sklearnex": - dist.pop("solver") # Only supports 'auto' - elif self.engine["estimator"] == "cuml": - dist["solver"] = Cat(["eig", "svd", "cd"]) - - return dist - - -class StochasticGradientDescent(ClassRegModel): - """Stochastic Gradient Descent. - - Stochastic Gradient Descent is a simple yet very efficient approach - to fitting linear classifiers and regressors under convex loss - functions. Even though SGD has been around in the machine learning - community for a long time, it has received a considerable amount of - attention just recently in the context of large-scale learning. - - Corresponding estimators are: - - - [SGDClassifier][] for classification tasks. - - [SGDRegressor][] for regression tasks. - - Read more in sklearn's [documentation][sgddocs]. 
- - See Also - -------- - atom.models:MultiLayerPerceptron - atom.models:PassiveAggressive - atom.models:SupportVectorMachine - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="SGD", metric="f1", verbose=2) - ``` - - """ - - acronym = "SGD" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = "max_iter" - supports_engines = ["sklearn"] - - _module = "linear_model" - _estimators = CustomDict({"class": "SGDClassifier", "reg": "SGDRegressor"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("penalty", params) != "elasticnet": - params.pop("l1_ratio") - - if self._get_param("learning_rate", params) == "optimal": - params.pop("eta0") - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - loss = [ - "hinge", - "log_loss", - "modified_huber", - "squared_hinge", - "perceptron", - "squared_error", - "huber", - "epsilon_insensitive", - "squared_epsilon_insensitive", - ] - - return CustomDict( - loss=Cat(loss if self.goal == "class" else loss[-4:]), - penalty=Cat([None, "l1", "l2", "elasticnet"]), - alpha=Float(1e-4, 1.0, log=True), - l1_ratio=Float(0.1, 0.9, step=0.1), - max_iter=Int(500, 1500, step=50), - epsilon=Float(1e-4, 1.0, log=True), - learning_rate=Cat(["constant", "invscaling", "optimal", "adaptive"]), - eta0=Float(1e-2, 10, log=True), - power_t=Float(0.1, 0.9, step=0.1), - average=Cat([True, False]), - ) - - -class SupportVectorMachine(ClassRegModel): - """Support Vector Machine. - - The implementation of the Support Vector Machine is based on libsvm. - The fit time scales at least quadratically with the number of - samples and may be impractical beyond tens of thousands of samples. - For large datasets consider using a [LinearSVM][] or a - [StochasticGradientDescent][] model instead. - - Corresponding estimators are: - - - [SVC][] for classification tasks. - - [SVR][] for classification tasks. - - Read more in sklearn's [documentation][svmdocs]. - - See Also - -------- - atom.models:LinearSVM - atom.models:MultiLayerPerceptron - atom.models:StochasticGradientDescent - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="SVM", metric="f1", verbose=2) - ``` - - """ - - acronym = "SVM" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = None - supports_engines = ["sklearn", "sklearnex", "cuml"] - - _module = "svm" - _estimators = CustomDict({"class": "SVC", "reg": "SVR"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. 
- - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self.goal == "class": - params.pop("epsilon") - - kernel = self._get_param("kernel", params) - if kernel == "poly": - params.replace_value("gamma", "scale") # Crashes in combination with "auto" - else: - params.pop("degree") - - if kernel not in ("rbf", "poly", "sigmoid"): - params.pop("gamma") - - if kernel not in ("poly", "sigmoid"): - params.pop("coef0") - - return params - - def _get_est(self, **params) -> PREDICTOR: - """Get the model's estimator with unpacked parameters. - - Returns - ------- - Predictor - Estimator instance. - - """ - if self.engine["estimator"] == "cuml" and self.goal == "class": - return self._est_class( - probability=params.pop("probability", True), - random_state=params.pop("random_state", self.random_state), - **params) - else: - return super()._get_est(**params) - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - dist = CustomDict( - C=Float(1e-3, 100, log=True), - kernel=Cat(["linear", "poly", "rbf", "sigmoid"]), - degree=Int(2, 5), - gamma=Cat(["scale", "auto"]), - coef0=Float(-1.0, 1.0), - epsilon=Float(1e-3, 100, log=True), - shrinking=Cat([True, False]), - ) - - if self.engine["estimator"] == "cuml": - dist.pop("epsilon") - dist.pop("shrinking") - - return dist - - -class XGBoost(ClassRegModel): - """Extreme Gradient Boosting. - - XGBoost is an optimized distributed gradient boosting model - designed to be highly efficient, flexible and portable. XGBoost - provides a parallel tree boosting that solve many data science - problems in a fast and accurate way. - - Corresponding estimators are: - - - [XGBClassifier][] for classification tasks. - - [XGBRegressor][] for regression tasks. 
- - Read more in XGBoost's [documentation][xgbdocs]. - - See Also - -------- - atom.models:CatBoost - atom.models:GradientBoostingMachine - atom.models:LightGBM - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(models="XGB", metric="f1", verbose=2) - ``` - - """ - - acronym = "XGB" - needs_scaling = True - accepts_sparse = True - native_multilabel = False - native_multioutput = False - has_validation = "n_estimators" - supports_engines = ["xgboost"] - - _module = "xgboost" - _estimators = CustomDict({"class": "XGBClassifier", "reg": "XGBRegressor"}) - - def _get_est(self, **params) -> PREDICTOR: - """Get the model's estimator with unpacked parameters. - - Returns - ------- - Predictor - Estimator instance. - - """ - eval_metric = None - if getattr(self, "_metric", None): - eval_metric = XGBMetric(self._metric[0], task=self.task) - - return self._est_class( - eval_metric=params.pop("eval_metric", eval_metric), - n_jobs=params.pop("n_jobs", self.n_jobs), - tree_method=params.pop("tree_method", "gpu_hist" if self._gpu else None), - gpu_id=self._device_id, - verbosity=params.pop("verbosity", 0), - random_state=params.pop("random_state", self.random_state), - **params, - ) - - def _fit_estimator( - self, - estimator: PREDICTOR, - data: tuple[DATAFRAME, SERIES], - est_params_fit: dict, - validation: tuple[DATAFRAME, SERIES] | None = None, - trial: Trial | None = None, - ): - """Fit the estimator and perform in-training validation. - - Parameters - ---------- - estimator: Predictor - Instance to fit. - - data: tuple - Training data of the form (X, y). - - est_params_fit: dict - Additional parameters for the estimator's fit method. - - validation: tuple or None - Validation data of the form (X, y). If None, no validation - is performed. 
- - trial: [Trial][] or None - Active trial (during hyperparameter tuning). - - Returns - ------- - Predictor - Fitted instance. - - """ - m = self._metric[0].name - params = est_params_fit.copy() - - callbacks = params.pop("callbacks", []) - if trial and len(self._metric) == 1: - callbacks.append(XGBoostPruningCallback(trial, f"validation_1-{m}")) - - try: - estimator.set_params(callbacks=callbacks) - estimator.fit( - *data, - eval_set=[data, validation] if validation else None, - verbose=params.get("verbose", False), - **params, - ) - except TrialPruned as ex: - # Add the pruned step to the output - step = str(ex).split(" ")[-1][:-1] - steps = estimator.get_params()[self.has_validation] - trial.params[self.has_validation] = f"{step}/{steps}" - - trial.set_user_attr("estimator", estimator) - raise ex - - if validation: - # Create evals attribute with train and validation scores - # Negative because minimizes the function - results = estimator.evals_result() - self._evals[f"{m}_train"] = np.negative(results["validation_0"][m]) - self._evals[f"{m}_test"] = np.negative(results["validation_1"][m]) - - return estimator - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - n_estimators=Int(20, 500, step=10), - learning_rate=Float(0.01, 1.0, log=True), - max_depth=Int(1, 20), - gamma=Float(0, 1.0), - min_child_weight=Int(1, 10), - subsample=Float(0.5, 1.0, step=0.1), - colsample_bytree=Float(0.4, 1.0, step=0.1), - reg_alpha=Float(1e-4, 100, log=True), - reg_lambda=Float(1e-4, 100, log=True), - ) - - -# Time series ====================================================== >> - -class ARIMA(ForecastModel): - """Autoregressive Integrated Moving Average Model. - - Seasonal ARIMA models and exogeneous input is supported, hence this - estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX. 
- - An ARIMA model, is a generalization of an autoregressive moving - average (ARMA) model, and is fitted to time-series data in an effort - to forecast future points. ARIMA models can be especially - efficacious in cases where data shows evidence of non-stationarity. - - The "AR" part of ARIMA indicates that the evolving variable of - interest is regressed on its own lagged (i.e., prior observed) - values. The "MA" part indicates that the regression error is - actually a linear combination of error terms whose values occurred - contemporaneously and at various times in the past. The "I" (for - "integrated") indicates that the data values have been replaced with - the difference between their values and the previous values (and this - differencing process may have been performed more than once). - - Corresponding estimators are: - - - [ARIMA][arimaclass] for forecasting tasks. - - !!! warning - ARIMA often runs into numerical errors when optimizing the - hyperparameters. Possible solutions are: - - - Use the [AutoARIMA][] model instead. - - Use [`est_params`][directforecaster-est_params] to specify the - orders manually, e.g. `#!python atom.run("arima", n_trials=5, - est_params={"order": (1, 1, 0)})`. - - Use the `catch` parameter in [`ht_params`][directforecaster-ht_params] - to avoid raising every exception, e.g. `#!python atom.run("arima", - n_trials=5, ht_params={"catch": (Exception,)})`. 
- - See Also - -------- - atom.models:AutoARIMA - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_longley - - _, X = load_longley() - - atom = ATOMForecaster(X) - atom.run(models="ARIMA", verbose=2) - ``` - - """ - - acronym = "ARIMA" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None - supports_engines = ["sktime"] - - _module = "sktime.forecasting.arima" - _estimators = CustomDict({"fc": "ARIMA"}) - - _order = ("p", "d", "q") - _sorder = ("Ps", "Ds", "Qs", "S") - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - # If no seasonal periodicity, set seasonal components to zero - if self._get_param("S", params) == 0: - for p in self._sorder: - params.replace_value(p, 0) - - return params - - def _trial_to_est(self, params: CustomDict) -> CustomDict: - """Convert trial's hyperparameters to parameters for the estimator. - - Parameters - ---------- - params: CustomDict - Trial's hyperparameters. - - Returns - ------- - CustomDict - Estimator's hyperparameters. - - """ - params = super()._trial_to_est(params) - - # Convert params to hyperparameters order and seasonal_order - if all(p in params for p in self._sorder): - params.insert(0, "seasonal_order", tuple(params.pop(p) for p in self._sorder)) - if all(p in params for p in self._order): - params.insert(0, "order", tuple(params.pop(p) for p in self._order)) - - return params - - def _get_distributions(self) -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. 
- - """ - methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"] - - dist = CustomDict( - p=Int(0, 2), - d=Int(0, 1), - q=Int(0, 2), - Ps=Int(0, 2), - Ds=Int(0, 1), - Qs=Int(0, 2), - S=Cat([0, 4, 6, 7, 12]), - method=Cat(methods), - maxiter=Int(50, 200, step=10), - with_intercept=Cat([True, False]), - ) - - # Drop order and seasonal_order params if specified by user - if "order" in self._est_params: - for p in self._order: - dist.pop(p) - if "seasonal_order" in self._est_params: - for p in self._sorder: - dist.pop(p) - - return dist - - -class AutoARIMA(ForecastModel): - """Automatic Autoregressive Integrated Moving Average Model. - - [ARIMA][] implementation that includes automated fitting of - (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA - algorithm seeks to identify the most optimal parameters for an - ARIMA model, settling on a single fitted ARIMA model. This process - is based on the commonly-used R function. - - AutoARIMA works by conducting differencing tests (i.e., - Kwiatkowski–Phillips–Schmidt–Shin, Augmented Dickey-Fuller or - Phillips–Perron) to determine the order of differencing, d, and - then fitting models within defined ranges. AutoARIMA also seeks - to identify the optimal P and Q hyperparameters after conducting - the Canova-Hansen to determine the optimal order of seasonal - differencing. - - Note that due to stationarity issues, AutoARIMA might not find a - suitable model that will converge. If this is the case, a ValueError - is thrown suggesting stationarity-inducing measures be taken prior - to re-fitting or that a new range of order values be selected. - - Corresponding estimators are: - - - [AutoARIMA][autoarimaclass] for forecasting tasks. 
- - See Also - -------- - atom.models:ARIMA - atom.models:ETS - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_longley - - _, X = load_longley() - - atom = ATOMForecaster(X, random_state=1) - atom.run(models="autoarima", verbose=2) - ``` - - """ - - acronym = "AutoARIMA" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None - supports_engines = ["sktime"] - - _module = "sktime.forecasting.arima" - _estimators = CustomDict({"fc": "AutoARIMA"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"] - - return CustomDict( - method=Cat(methods), - maxiter=Int(50, 200, step=10), - with_intercept=Cat([True, False]), - ) - - -class ExponentialSmoothing(ForecastModel): - """Exponential Smoothing forecaster. - - Holt-Winters exponential smoothing forecaster. The default settings - use simple exponential smoothing, without trend and seasonality - components. - - Corresponding estimators are: - - - [ExponentialSmoothing][esclass] for forecasting tasks. 
- - See Also - -------- - atom.models:ARIMA - atom.models:ETS - atom.models:PolynomialTrend - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_airline - - y = load_airline() - - atom = ATOMForecaster(y, random_state=1) - atom.run(models="ES", verbose=2) - ``` - - """ - - acronym = "ES" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None - supports_engines = ["sktime"] - - _module = "sktime.forecasting.exp_smoothing" - _estimators = CustomDict({"fc": "ExponentialSmoothing"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - if self._get_param("trend", params) is None: - params.pop("damped_trend") - - if self._get_param("sp", params) is None: - params.pop("seasonal") - - return params - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - methods = ["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"] - - return CustomDict( - trend=Cat(["add", "mul", None]), - damped_trend=Cat([True, False]), - seasonal=Cat(["add", "mul", None]), - sp=Cat([4, 6, 7, 12, None]), - use_boxcox=Cat([True, False]), - initialization_method=Cat(["estimated", "heuristic"]), - method=Cat(methods), - ) - - -class ETS(ForecastModel): - """ETS model with automatic fitting capabilities. - - The ETS models are a family of time series models with an - underlying state space model consisting of a level component, - a trend component (T), a seasonal component (S), and an error - term (E). - - Corresponding estimators are: - - - [AutoETS][] for forecasting tasks. 
- - See Also - -------- - atom.models:ARIMA - atom.models:ExponentialSmoothing - atom.models:PolynomialTrend - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_airline - - y = load_airline() - - atom = ATOMForecaster(y, random_state=1) - atom.run(models="ETS", verbose=2) - - ``` - - """ - - acronym = "ETS" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None - supports_engines = ["sktime"] - - _module = "sktime.forecasting.ets" - _estimators = CustomDict({"fc": "AutoETS"}) - - def _get_parameters(self, trial: Trial) -> CustomDict: - """Get the trial's hyperparameters. - - Parameters - ---------- - trial: [Trial][] - Current trial. - - Returns - ------- - CustomDict - Trial's hyperparameters. - - """ - params = super()._get_parameters(trial) - - # If no seasonal periodicity, set seasonal components to zero - if self._get_param("sp", params) == 1: - params.pop("seasonal") - - return params - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - error=Cat(["add", "mul"]), - trend=Cat(["add", "mul", None]), - damped_trend=Cat([True, False]), - seasonal=Cat(["add", "mul", None]), - sp=Cat([1, 4, 6, 7, 12]), - initialization_method=Cat(["estimated", "heuristic"]), - maxiter=Int(500, 2000, step=100), - auto=Cat([True, False]), - information_criterion=Cat(["aic", "bic", "aicc"]), - ) - - -class NaiveForecaster(ForecastModel): - """Naive Forecaster. - - NaiveForecaster is a dummy forecaster that makes forecasts using - simple strategies based on naive assumptions about past trends - continuing. When used in [multivariate][] tasks, each column is - forecasted with the same strategy. - - Corresponding estimators are: - - - [NaiveForecaster][naiveforecasterclass] for forecasting tasks. 
- - See Also - -------- - atom.models:ExponentialSmoothing - atom.models:Dummy - atom.models:PolynomialTrend - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_airline - - y = load_airline() - - atom = ATOMForecaster(y, random_state=1) - atom.run(models="NF", verbose=2) - - ``` - - """ - - acronym = "NF" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None - supports_engines = ["sktime"] - - _module = "sktime.forecasting.naive" - _estimators = CustomDict({"fc": "NaiveForecaster"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. - - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict(strategy=Cat(["last", "mean", "drift"])) - - -class PolynomialTrend(ForecastModel): - """Polynomial Trend forecaster. - - Forecast time series data with a polynomial trend, using a sklearn - [LinearRegression][] class to regress values of time series on - index, after extraction of polynomial features. - - Corresponding estimators are: - - - [PolynomialTrendForecaster][] for forecasting tasks. - - See Also - -------- - atom.models:ARIMA - atom.models:ETS - atom.models:NaiveForecaster - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_airline - - y = load_airline() - - atom = ATOMForecaster(y, random_state=1) - atom.run(models="PT", verbose=2) - ``` - - """ - - acronym = "PT" - needs_scaling = False - accepts_sparse = False - native_multilabel = False - native_multioutput = True - has_validation = None - supports_engines = ["sktime"] - - _module = "sktime.forecasting.trend" - _estimators = CustomDict({"fc": "PolynomialTrendForecaster"}) - - @staticmethod - def _get_distributions() -> CustomDict: - """Get the predefined hyperparameter distributions. 
- - Returns - ------- - CustomDict - Hyperparameter distributions. - - """ - return CustomDict( - degree=Int(1, 5), - with_intercept=Cat([True, False]), - ) - - -# Ensembles ======================================================== >> - -class Stacking(ClassRegModel): - """Stacking ensemble. - - Parameters - ---------- - models: ClassMap - Models from which to build the ensemble. - - **kwargs - Additional keyword arguments for the estimator. - - """ - - acronym = "Stack" - needs_scaling = False - has_validation = None - native_multilabel = False - native_multioutput = False - supports_engines = [] - - _module = "atom.ensembles" - _estimators = CustomDict({"class": "StackingClassifier", "reg": "StackingRegressor"}) - - def __init__(self, models: ClassMap, **kwargs): - self._models = models - kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)} - super().__init__(**kw_model) - self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model} - - def _get_est(self, **params) -> PREDICTOR: - """Get the model's estimator with unpacked parameters. - - Returns - ------- - Predictor - Estimator instance. - - """ - estimators = [] - for m in self._models: - if m.scaler: - name = f"pipeline_{m.name}" - est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)]) - else: - name = m.name - est = m.estimator - - estimators.append((name, est)) - - return self._est_class( - estimators=estimators, - n_jobs=params.pop("n_jobs", self.n_jobs), - **params, - ) - - -class Voting(ClassRegModel): - """Voting ensemble. - - Parameters - ---------- - models: ClassMap - Models from which to build the ensemble. - - **kwargs - Additional keyword arguments for the estimator. 
- - """ - - acronym = "Vote" - needs_scaling = False - has_validation = None - native_multilabel = False - native_multioutput = False - supports_engines = [] - - _module = "atom.ensembles" - _estimators = CustomDict({"class": "VotingClassifier", "reg": "VotingRegressor"}) - - def __init__(self, models: ClassMap, **kwargs): - self._models = models - kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)} - super().__init__(**kw_model) - self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model} - - if self._est_params.get("voting") == "soft": - for m in self._models: - if not hasattr(m.estimator, "predict_proba"): - raise ValueError( - "Invalid value for the voting parameter. If " - "'soft', all models in the ensemble should have " - f"a predict_proba method, got {m._fullname}." - ) - - def _get_est(self, **params) -> PREDICTOR: - """Get the model's estimator with unpacked parameters. - - Returns - ------- - Predictor - Estimator instance. - - """ - estimators = [] - for m in self._models: - if m.scaler: - name = f"pipeline_{m.name}" - est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)]) - else: - name = m.name - est = m.estimator - - estimators.append((name, est)) - - return self._est_class( - estimators=estimators, - n_jobs=params.pop("n_jobs", self.n_jobs), - **params, - ) - - -# Variables ======================================================== >> - -# Available models -MODELS = ClassMap( - AdaBoost, - ARIMA, - AutoARIMA, - AutomaticRelevanceDetermination, - Bagging, - BayesianRidge, - BernoulliNB, - CatBoost, - CategoricalNB, - ComplementNB, - DecisionTree, - Dummy, - ElasticNet, - ETS, - ExponentialSmoothing, - ExtraTree, - ExtraTrees, - GaussianNB, - GaussianProcess, - GradientBoostingMachine, - HuberRegression, - HistGradientBoosting, - KNearestNeighbors, - Lasso, - LeastAngleRegression, - LightGBM, - LinearDiscriminantAnalysis, - LinearSVM, - LogisticRegression, - MultiLayerPerceptron, - MultinomialNB, - 
NaiveForecaster, - OrdinaryLeastSquares, - OrthogonalMatchingPursuit, - PassiveAggressive, - Perceptron, - PolynomialTrend, - QuadraticDiscriminantAnalysis, - RadiusNearestNeighbors, - RandomForest, - Ridge, - StochasticGradientDescent, - SupportVectorMachine, - XGBoost, - key="acronym", -) - -# Available ensembles -ENSEMBLES = ClassMap(Stacking, Voting, key="acronym") - -# Available models + ensembles -MODELS_ENSEMBLES = ClassMap(*MODELS, *ENSEMBLES, key="acronym") +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing classification and regression models. + +""" + +from __future__ import annotations + +import numpy as np +from optuna.distributions import CategoricalDistribution as Cat +from optuna.distributions import FloatDistribution as Float +from optuna.distributions import IntDistribution as Int +from optuna.exceptions import TrialPruned +from optuna.integration import ( + CatBoostPruningCallback, LightGBMPruningCallback, XGBoostPruningCallback, +) +from optuna.trial import Trial + +from atom.basemodel import ClassRegModel +from atom.utils.types import DATAFRAME, PANDAS, PREDICTOR +from atom.utils.utils import CatBMetric, CustomDict, LGBMetric, XGBMetric + + +class AdaBoost(ClassRegModel): + """Adaptive Boosting (with decision tree as base estimator). + + AdaBoost is a meta-estimator that begins by fitting a + classifier/regressor on the original dataset and then fits + additional copies of the algorithm on the same dataset but where + the weights of instances are adjusted according to the error of + the current prediction. + + Corresponding estimators are: + + - [AdaBoostClassifier][] for classification tasks. + - [AdaBoostRegressor][] for regression tasks. + + Read more in sklearn's [documentation][adabdocs]. 
+ + See Also + -------- + atom.models:GradientBoostingMachine + atom.models:RandomForest + atom.models:XGBoost + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="AdaB", metric="f1", verbose=2) + ``` + + """ + + acronym = "AdaB" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "ensemble" + _estimators = CustomDict({"class": "AdaBoostClassifier", "reg": "AdaBoostRegressor"}) + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + dist = CustomDict( + n_estimators=Int(50, 500, step=10), + learning_rate=Float(0.01, 10, log=True), + ) + + if self.goal == "class": + dist["algorithm"] = Cat(["SAMME.R", "SAMME"]) + else: + dist["loss"] = Cat(["linear", "square", "exponential"]) + + return dist + + +class AutomaticRelevanceDetermination(ClassRegModel): + """Automatic Relevance Determination. + + Automatic Relevance Determination is very similar to + [BayesianRidge][], but can lead to sparser coefficients. Fit the + weights of a regression model, using an ARD prior. The weights of + the regression model are assumed to be in Gaussian distributions. + + Corresponding estimators are: + + - [ARDRegression][] for regression tasks. + + Read more in sklearn's [documentation][arddocs]. 
+ + See Also + -------- + atom.models:BayesianRidge + atom.models:GaussianProcess + atom.models:LeastAngleRegression + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import fetch_california_housing + + X, y = fetch_california_housing(return_X_y=True) + + atom = ATOMRegressor(X, y, random_state=1) + atom.run(models="ARD", metric="r2", verbose=2) + ``` + + """ + + acronym = "ARD" + needs_scaling = True + accepts_sparse = False + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "linear_model" + _estimators = CustomDict({"reg": "ARDRegression"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + n_iter=Int(100, 1000, step=10), + alpha_1=Float(1e-4, 1, log=True), + alpha_2=Float(1e-4, 1, log=True), + lambda_1=Float(1e-4, 1, log=True), + lambda_2=Float(1e-4, 1, log=True), + ) + + +class Bagging(ClassRegModel): + """Bagging model (with decision tree as base estimator). + + Bagging uses an ensemble meta-estimator that fits base predictors + on random subsets of the original dataset and then aggregates their + individual predictions (either by voting or by averaging) to form a + final prediction. Such a meta-estimator can typically be used as a + way to reduce the variance of a black-box estimator by introducing + randomization into its construction procedure and then making an + ensemble out of it. + + Corresponding estimators are: + + - [BaggingClassifier][] for classification tasks. + - [BaggingRegressor][] for regression tasks. + + Read more in sklearn's [documentation][bagdocs]. 
+ + See Also + -------- + atom.models:DecisionTree + atom.models:LogisticRegression + atom.models:RandomForest + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="Bag", metric="f1", verbose=2) + ``` + + """ + + acronym = "Bag" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "ensemble" + _estimators = CustomDict({"class": "BaggingClassifier", "reg": "BaggingRegressor"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + n_estimators=Int(10, 500, step=10), + max_samples=Float(0.5, 1.0, step=0.1), + max_features=Float(0.5, 1.0, step=0.1), + bootstrap=Cat([True, False]), + bootstrap_features=Cat([True, False]), + ) + + +class BayesianRidge(ClassRegModel): + """Bayesian ridge regression. + + Bayesian regression techniques can be used to include regularization + parameters in the estimation procedure: the regularization parameter + is not set in a hard sense but tuned to the data at hand. + + Corresponding estimators are: + + - [BayesianRidge][bayesianridgeclass] for regression tasks. + + Read more in sklearn's [documentation][brdocs]. 
+ + See Also + -------- + atom.models:AutomaticRelevanceDetermination + atom.models:GaussianProcess + atom.models:LeastAngleRegression + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import fetch_california_housing + + X, y = fetch_california_housing(return_X_y=True) + + atom = ATOMRegressor(X, y, random_state=1) + atom.run(models="BR", metric="r2", verbose=2) + ``` + + """ + + acronym = "BR" + needs_scaling = True + accepts_sparse = False + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "linear_model" + _estimators = CustomDict({"reg": "BayesianRidge"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + n_iter=Int(100, 1000, step=10), + alpha_1=Float(1e-4, 1, log=True), + alpha_2=Float(1e-4, 1, log=True), + lambda_1=Float(1e-4, 1, log=True), + lambda_2=Float(1e-4, 1, log=True), + ) + + +class BernoulliNB(ClassRegModel): + """Bernoulli Naive Bayes. + + BernoulliNB implements the Naive Bayes algorithm for multivariate + Bernoulli models. Like [MultinomialNB][], this classifier is + suitable for discrete data. The difference is that while MNB works + with occurrence counts, BNB is designed for binary/boolean features. + + Corresponding estimators are: + + - [BernoulliNB][bernoullinbclass] for classification tasks. + + Read more in sklearn's [documentation][bnbdocs]. 
+ + See Also + -------- + atom.models:ComplementNB + atom.models:CategoricalNB + atom.models:MultinomialNB + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="BNB", metric="f1", verbose=2) + ``` + + """ + + acronym = "BNB" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "cuml"] + + _module = "naive_bayes" + _estimators = CustomDict({"class": "BernoulliNB"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + alpha=Float(0.01, 10, log=True), + fit_prior=Cat([True, False]), + ) + + +class CatBoost(ClassRegModel): + """Cat Boosting Machine. + + CatBoost is a machine learning method based on gradient boosting + over decision trees. Main advantages of CatBoost: + + - Superior quality when compared with other GBDT models on many + datasets. + - Best in class prediction speed. + + Corresponding estimators are: + + - [CatBoostClassifier][] for classification tasks. + - [CatBoostRegressor][] for regression tasks. + + Read more in CatBoost's [documentation][catbdocs]. + + !!! warning + * CatBoost selects the weights achieved by the best evaluation + on the test set after training. This means that, by default, + there is some minor data leakage in the test set. Use the + `use_best_model=False` parameter to avoid this behavior or use + a [holdout set][data-sets] to evaluate the final estimator. + * [In-training validation][] and [pruning][] are disabled when + `#!python device="gpu"`. + + !!! note + ATOM uses CatBoost's `n_estimators` parameter instead of + `iterations` to indicate the number of trees to fit. 
This is + done to have consistent naming with the [XGBoost][] and + [LightGBM][] models. + + See Also + -------- + atom.models:GradientBoostingMachine + atom.models:LightGBM + atom.models:XGBoost + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="CatB", metric="f1", verbose=2) + ``` + + """ + + acronym = "CatB" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = "n_estimators" + supports_engines = ["catboost"] + + _module = "catboost" + _estimators = CustomDict({"class": "CatBoostClassifier", "reg": "CatBoostRegressor"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self._get_param("bootstrap_type", params) == "Bernoulli": + params.pop("bagging_temperature") + elif self._get_param("bootstrap_type", params) == "Bayesian": + params.pop("subsample") + + return params + + def _get_est(self, **params) -> PREDICTOR: + """Get the estimator instance. + + Parameters + ---------- + **params + Unpacked hyperparameters for the estimator. + + Returns + ------- + Predictor + Estimator instance. 
+ + """ + eval_metric = None + if getattr(self, "_metric", None) and not self._gpu: + eval_metric = CatBMetric(self._metric[0], task=self.task) + + return self._est_class( + eval_metric=params.pop("eval_metric", eval_metric), + train_dir=params.pop("train_dir", ""), + allow_writing_files=params.pop("allow_writing_files", False), + thread_count=params.pop("n_jobs", self.n_jobs), + task_type=params.pop("task_type", "GPU" if self._gpu else "CPU"), + devices=str(self._device_id), + verbose=params.pop("verbose", False), + random_state=params.pop("random_state", self.random_state), + **params, + ) + + def _fit_estimator( + self, + estimator: PREDICTOR, + data: tuple[DATAFRAME, PANDAS], + est_params_fit: dict, + validation: tuple[DATAFRAME, PANDAS] | None = None, + trial: Trial | None = None, + ): + """Fit the estimator and perform in-training validation. + + Parameters + ---------- + estimator: Predictor + Instance to fit. + + data: tuple + Training data of the form (X, y). + + est_params_fit: dict + Additional parameters for the estimator's fit method. + + validation: tuple or None + Validation data of the form (X, y). If None, no validation + is performed. + + trial: [Trial][] or None + Active trial (during hyperparameter tuning). + + Returns + ------- + Predictor + Fitted instance. 
+ + """ + params = est_params_fit.copy() + + callbacks = params.pop("callbacks", []) + if trial and len(self._metric) == 1 and not self._gpu: + callbacks.append(cb := CatBoostPruningCallback(trial, "CatBMetric")) + + # gpu implementation fails if callbacks!=None + estimator.fit(*data, eval_set=validation, callbacks=callbacks or None, **params) + + if not self._gpu: + if validation: + # Create evals attribute with train and validation scores + m = self._metric[0].name + evals = estimator.evals_result_ + self._evals[f"{m}_train"] = evals["learn"]["CatBMetric"] + self._evals[f"{m}_test"] = evals["validation"]["CatBMetric"] + + if trial and len(self._metric) == 1 and cb._pruned: + # Add the pruned step to the output + step = len(self.evals[f'{m}_train']) + steps = estimator.get_params()[self.has_validation] + trial.params[self.has_validation] = f"{step}/{steps}" + + trial.set_user_attr("estimator", estimator) + raise TrialPruned(cb._message) + + return estimator + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + n_estimators=Int(20, 500, step=10), + learning_rate=Float(0.01, 1.0, log=True), + max_depth=Cat([None, *range(1, 17)]), + min_child_samples=Int(1, 30), + bootstrap_type=Cat(["Bayesian", "Bernoulli"]), + bagging_temperature=Float(0, 10), + subsample=Float(0.5, 1.0, step=0.1), + reg_lambda=Float(0.001, 100, log=True), + ) + + +class CategoricalNB(ClassRegModel): + """Categorical Naive Bayes. + + Categorical Naive Bayes implements the Naive Bayes algorithm for + categorical features. + + Corresponding estimators are: + + - [CategoricalNB][categoricalnbclass] for classification tasks. + + Read more in sklearn's [documentation][catnbdocs]. 
+ + See Also + -------- + atom.models:BernoulliNB + atom.models:ComplementNB + atom.models:GaussianNB + + Examples + -------- + ```pycon + from atom import ATOMClassifier + import numpy as np + + X = np.random.randint(5, size=(100, 100)) + y = np.random.randint(2, size=100) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="CatNB", metric="f1", verbose=2) + ``` + + """ + + acronym = "CatNB" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "cuml"] + + _module = "naive_bayes" + _estimators = CustomDict({"class": "CategoricalNB"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + alpha=Float(0.01, 10, log=True), + fit_prior=Cat([True, False]), + ) + + +class ComplementNB(ClassRegModel): + """Complement Naive Bayes. + + The Complement Naive Bayes classifier was designed to correct the + "severe assumptions" made by the standard [MultinomialNB][] + classifier. It is particularly suited for imbalanced datasets. + + Corresponding estimators are: + + - [ComplementNB][complementnbclass] for classification tasks. + + Read more in sklearn's [documentation][cnbdocs]. 
+ + See Also + -------- + atom.models:BernoulliNB + atom.models:CategoricalNB + atom.models:MultinomialNB + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="CNB", metric="f1", verbose=2) + ``` + + """ + + acronym = "CNB" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "cuml"] + + _module = "naive_bayes" + _estimators = CustomDict({"class": "ComplementNB"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + alpha=Float(0.01, 10, log=True), + fit_prior=Cat([True, False]), + norm=Cat([True, False]), + ) + + +class DecisionTree(ClassRegModel): + """Single Decision Tree. + + A single decision tree classifier/regressor. + + Corresponding estimators are: + + - [DecisionTreeClassifier][] for classification tasks. + - [DecisionTreeRegressor][] for regression tasks. + + Read more in sklearn's [documentation][treedocs]. 
+ + See Also + -------- + atom.models:ExtraTree + atom.models:ExtraTrees + atom.models:RandomForest + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="Tree", metric="f1", verbose=2) + ``` + + """ + + acronym = "Tree" + needs_scaling = False + accepts_sparse = True + native_multilabel = True + native_multioutput = True + has_validation = None + supports_engines = ["sklearn"] + + _module = "tree" + _estimators = CustomDict( + {"class": "DecisionTreeClassifier", "reg": "DecisionTreeRegressor"} + ) + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + if self.goal == "class": + criterion = ["gini", "entropy"] + else: + criterion = ["squared_error", "absolute_error", "friedman_mse", "poisson"] + + return CustomDict( + criterion=Cat(criterion), + splitter=Cat(["best", "random"]), + max_depth=Cat([None, *range(1, 17)]), + min_samples_split=Int(2, 20), + min_samples_leaf=Int(1, 20), + max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), + ccp_alpha=Float(0, 0.035, step=0.005), + ) + + +class Dummy(ClassRegModel): + """Dummy classifier/regressor. + + When doing supervised learning, a simple sanity check consists of + comparing one's estimator against simple rules of thumb. The + prediction methods completely ignore the input data. Do not use + this model for real problems. Use it only as a simple baseline + to compare with other models. + + Corresponding estimators are: + + - [DummyClassifier][] for classification tasks. + - [DummyRegressor][] for regression tasks. + + Read more in sklearn's [documentation][dummydocs]. 
+ + See Also + -------- + atom.models:DecisionTree + atom.models:ExtraTree + atom.models:NaiveForecaster + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="Dummy", metric="f1", verbose=2) + ``` + + """ + + acronym = "Dummy" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "dummy" + _estimators = CustomDict({"class": "DummyClassifier", "reg": "DummyRegressor"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self._get_param("strategy", params) != "quantile": + params.pop("quantile") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + dist = CustomDict() + if self.goal == "class": + dist["strategy"] = Cat(["most_frequent", "prior", "stratified", "uniform"]) + else: + dist["strategy"] = Cat(["mean", "median", "quantile"]) + dist["quantile"] = Float(0, 1.0, step=0.1) + + return dist + + +class ElasticNet(ClassRegModel): + """Linear Regression with elasticnet regularization. + + Linear least squares with l1 and l2 regularization. + + Corresponding estimators are: + + - [ElasticNet][elasticnetreg] for regression tasks. + + Read more in sklearn's [documentation][endocs]. 
+ + See Also + -------- + atom.models:Lasso + atom.models:OrdinaryLeastSquares + atom.models:Ridge + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import fetch_california_housing + + X, y = fetch_california_housing(return_X_y=True) + + atom = ATOMRegressor(X, y, random_state=1) + atom.run(models="EN", metric="r2", verbose=2) + ``` + + """ + + acronym = "EN" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "sklearnex", "cuml"] + + _module = "linear_model" + _estimators = CustomDict({"reg": "ElasticNet"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + alpha=Float(1e-3, 10, log=True), + l1_ratio=Float(0.1, 0.9, step=0.1), + selection=Cat(["cyclic", "random"]), + ) + + +class ExtraTree(ClassRegModel): + """Extremely Randomized Tree. + + Extra-trees differ from classic decision trees in the way they are + built. When looking for the best split to separate the samples of a + node into two groups, random splits are drawn for each of the + max_features randomly selected features and the best split among + those is chosen. When max_features is set to 1, this amounts to + building a totally random decision tree. + + Corresponding estimators are: + + - [ExtraTreeClassifier][] for classification tasks. + - [ExtraTreeRegressor][] for regression tasks. + + Read more in sklearn's [documentation][treedocs]. 
+ + See Also + -------- + atom.models:DecisionTree + atom.models:ExtraTrees + atom.models:RandomForest + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="ETree", metric="f1", verbose=2) + ``` + + """ + + acronym = "ETree" + needs_scaling = False + accepts_sparse = True + native_multilabel = True + native_multioutput = True + has_validation = None + supports_engines = ["sklearn"] + + _module = "tree" + _estimators = CustomDict( + {"class": "ExtraTreeClassifier", "reg": "ExtraTreeRegressor"} + ) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if not self._get_param("bootstrap", params): + params.pop("max_samples") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + if self.goal == "class": + criterion = ["gini", "entropy"] + else: + criterion = ["squared_error", "absolute_error"] + + return CustomDict( + criterion=Cat(criterion), + splitter=Cat(["random", "best"]), + max_depth=Cat([None, *range(1, 17)]), + min_samples_split=Int(2, 20), + min_samples_leaf=Int(1, 20), + max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), + ccp_alpha=Float(0, 0.035, step=0.005), + ) + + +class ExtraTrees(ClassRegModel): + """Extremely Randomized Trees. + + Extra-Trees use a meta estimator that fits a number of randomized + decision trees (a.k.a. [extra-trees][extratree]) on various + sub-samples of the dataset and uses averaging to improve the + predictive accuracy and control over-fitting. 
+ + Corresponding estimators are: + + - [ExtraTreesClassifier][] for classification tasks. + - [ExtraTreesRegressor][] for regression tasks. + + Read more in sklearn's [documentation][etdocs]. + + See Also + -------- + atom.models:DecisionTree + atom.models:ExtraTree + atom.models:RandomForest + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="ET", metric="f1", verbose=2) + ``` + + """ + + acronym = "ET" + needs_scaling = False + accepts_sparse = True + native_multilabel = True + native_multioutput = True + has_validation = None + supports_engines = ["sklearn"] + + _module = "ensemble" + _estimators = CustomDict( + {"class": "ExtraTreesClassifier", "reg": "ExtraTreesRegressor"} + ) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if not self._get_param("bootstrap", params): + params.pop("max_samples") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. 
class GaussianProcess(ClassRegModel):
    """Gaussian process.

    Gaussian Processes are a generic supervised learning method
    designed to solve regression and probabilistic classification
    problems. The advantages of Gaussian processes are:

    * The prediction interpolates the observations.
    * The prediction is probabilistic (Gaussian) so that one can compute
      empirical confidence intervals and decide based on those if one
      should refit (online fitting, adaptive fitting) the prediction in
      some region of interest.

    The disadvantages of Gaussian processes include:

    * They are not sparse, i.e. they use the whole samples/features
      information to perform the prediction.
    * They lose efficiency in high dimensional spaces, namely when the
      number of features exceeds a few dozens.

    Corresponding estimators are:

    - [GaussianProcessClassifier][] for classification tasks.
    - [GaussianProcessRegressor][] for regression tasks.

    Read more in sklearn's [documentation][gpdocs].

    See Also
    --------
    atom.models:GaussianNB
    atom.models:LinearDiscriminantAnalysis
    atom.models:PassiveAggressive

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="GP", metric="f1", verbose=2)
    ```

    """

    # Short name used to select the model, e.g. `atom.run(models="GP")`
    acronym = "GP"
    # Capability flags; presumably read by the base class to decide on
    # preprocessing and multioutput handling. TODO confirm against ClassRegModel
    needs_scaling = False
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    # No in-training validation parameter is declared for this model
    has_validation = None
    supports_engines = ["sklearn"]

    # Estimator class names keyed by goal ("class" / "reg"); presumably
    # resolved inside the `_module` submodule of the engine's package
    _module = "gaussian_process"
    _estimators = CustomDict(
        {"class": "GaussianProcessClassifier", "reg": "GaussianProcessRegressor"}
    )
+ + Corresponding estimators are: + + - [GradientBoostingClassifier][] for classification tasks. + - [GradientBoostingRegressor][] for regression tasks. + + Read more in sklearn's [documentation][gbmdocs]. + + !!! tip + [HistGradientBoosting][] is a much faster variant of this + algorithm for intermediate datasets (n_samples >= 10k). + + See Also + -------- + atom.models:CatBoost + atom.models:HistGradientBoosting + atom.models:LightGBM + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="GBM", metric="f1", verbose=2) + ``` + + """ + + acronym = "GBM" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "ensemble" + _estimators = CustomDict( + {"class": "GradientBoostingClassifier", "reg": "GradientBoostingRegressor"} + ) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self._get_param("loss", params) not in ("huber", "quantile"): + params.pop("alpha") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. 
class HuberRegression(ClassRegModel):
    """Huber regressor.

    Huber is a linear regression model that is robust to outliers. It
    makes sure that the loss function is not heavily influenced by the
    outliers while not completely ignoring their effect.

    Corresponding estimators are:

    - [HuberRegressor][] for regression tasks.

    Read more in sklearn's [documentation][huberdocs].

    See Also
    --------
    atom.models:AutomaticRelevanceDetermination
    atom.models:LeastAngleRegression
    atom.models:OrdinaryLeastSquares

    Examples
    --------
    ```pycon
    from atom import ATOMRegressor
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)

    atom = ATOMRegressor(X, y, random_state=1)
    atom.run(models="Huber", metric="r2", verbose=2)
    ```

    """

    acronym = "Huber"
    needs_scaling = True
    accepts_sparse = False
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn"]

    _module = "linear_model"
    _estimators = CustomDict({"reg": "HuberRegressor"})

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Build the predefined hyperparameter search space.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        search_space = CustomDict()
        search_space["epsilon"] = Float(1, 10, log=True)
        search_space["max_iter"] = Int(50, 500, step=10)
        search_space["alpha"] = Float(1e-4, 1, log=True)

        return search_space
class KNearestNeighbors(ClassRegModel):
    """K-Nearest Neighbors.

    K-Nearest Neighbors, as the name clearly indicates, implements the
    k-nearest neighbors vote. For regression, the target is predicted
    by local interpolation of the targets associated with the nearest
    neighbors in the training set.

    Corresponding estimators are:

    - [KNeighborsClassifier][] for classification tasks.
    - [KNeighborsRegressor][] for regression tasks.

    Read more in sklearn's [documentation][knndocs].

    See Also
    --------
    atom.models:LinearDiscriminantAnalysis
    atom.models:QuadraticDiscriminantAnalysis
    atom.models:RadiusNearestNeighbors

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="KNN", metric="f1", verbose=2)
    ```

    """

    acronym = "KNN"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = True
    has_validation = None
    supports_engines = ["sklearn", "sklearnex", "cuml"]

    _module = "neighbors"
    _estimators = CustomDict(
        {"class": "KNeighborsClassifier", "reg": "KNeighborsRegressor"}
    )

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Parameters that the selected device/engine does not support are
        removed from the search space.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            n_neighbors=Int(1, 100),
            weights=Cat(["uniform", "distance"]),
            algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]),
            leaf_size=Int(20, 40),
            p=Int(1, 2),
        )

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the cuml branch is assumed to sit inside the GPU branch.
        if self._gpu:
            dist.pop("algorithm")  # Only 'brute' is supported
            if self.engine.get("estimator") == "cuml":
                dist.pop("weights")  # Only 'uniform' is supported
                dist.pop("leaf_size")
                dist.pop("p")

        return dist
class LightGBM(ClassRegModel):
    """Light Gradient Boosting Machine.

    LightGBM is a gradient boosting model that uses tree based learning
    algorithms. It is designed to be distributed and efficient with the
    following advantages:

    - Faster training speed and higher efficiency.
    - Lower memory usage.
    - Better accuracy.
    - Capable of handling large-scale data.

    Corresponding estimators are:

    - [LGBMClassifier][] for classification tasks.
    - [LGBMRegressor][] for regression tasks.

    Read more in LightGBM's [documentation][lgbdocs].

    !!! info
        Using LightGBM's [GPU acceleration][estimator-acceleration]
        requires [additional software dependencies][lgb_gpu].

    See Also
    --------
    atom.models:CatBoost
    atom.models:GradientBoostingMachine
    atom.models:XGBoost

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="LGB", metric="f1", verbose=2)
    ```

    """

    acronym = "LGB"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = "n_estimators"
    supports_engines = ["lightgbm"]

    _module = "lightgbm.sklearn"
    _estimators = CustomDict({"class": "LGBMClassifier", "reg": "LGBMRegressor"})

    def _get_est(self, **params) -> PREDICTOR:
        """Get the model's estimator with unpacked parameters.

        Parameters
        ----------
        **params
            Unpacked hyperparameters for the estimator. Explicit values
            for `verbose`, `n_jobs`, `device`, `gpu_device_id` and
            `random_state` take precedence over the instance's defaults.

        Returns
        -------
        Predictor
            Estimator instance.

        """
        # Custom lightgbm mapping for warnings
        # PYTHONWARNINGS doesn't work since they go from C/C++ code to stdout
        warns = dict(always=2, default=1, error=0, ignore=-1)

        return self._est_class(
            verbose=params.pop("verbose", warns.get(self.warnings, -1)),
            n_jobs=params.pop("n_jobs", self.n_jobs),
            device=params.pop("device", "gpu" if self._gpu else "cpu"),
            # A falsy device id (None or 0) falls back to -1
            gpu_device_id=params.pop("gpu_device_id", self._device_id or -1),
            random_state=params.pop("random_state", self.random_state),
            **params,
        )

    def _fit_estimator(
        self,
        estimator: PREDICTOR,
        data: tuple[DATAFRAME, PANDAS],
        est_params_fit: dict,
        validation: tuple[DATAFRAME, PANDAS] | None = None,
        trial: Trial | None = None,
    ):
        """Fit the estimator and perform in-training validation.

        Parameters
        ----------
        estimator: Predictor
            Instance to fit.

        data: tuple
            Training data of the form (X, y).

        est_params_fit: dict
            Additional parameters for the estimator's fit method.

        validation: tuple or None
            Validation data of the form (X, y). If None, no validation
            is performed.

        trial: [Trial][] or None
            Active trial (during hyperparameter tuning).

        Returns
        -------
        Predictor
            Fitted instance.

        Raises
        ------
        TrialPruned
            Re-raised after recording the pruned step and the partially
            fitted estimator on the trial.

        """
        from lightgbm.callback import log_evaluation

        m = self._metric[0].name
        params = est_params_fit.copy()

        # Build a fresh list so the caller's object is never mutated and
        # any iterable (list, tuple) or None is accepted for `callbacks`
        user_callbacks = params.pop("callbacks", None) or []
        callbacks = [*user_callbacks, log_evaluation(-1)]
        if trial and len(self._metric) == 1:
            callbacks.append(LightGBMPruningCallback(trial, m, "valid_1"))

        # self._metric[0] was already read above, so a metric is
        # guaranteed to be available here
        eval_metric = LGBMetric(self._metric[0], task=self.task)

        try:
            estimator.fit(
                *data,
                eval_set=[data, validation] if validation else None,
                eval_metric=params.pop("eval_metric", eval_metric),
                callbacks=callbacks,
                **params,
            )
        except TrialPruned as ex:
            # Add the pruned step to the output
            step = str(ex).split(" ")[-1][:-1]
            steps = estimator.get_params()[self.has_validation]
            trial.params[self.has_validation] = f"{step}/{steps}"

            trial.set_user_attr("estimator", estimator)
            raise  # Bare raise keeps the original traceback intact

        if validation:
            # Create evals attribute with train and validation scores
            self._evals[f"{m}_train"] = estimator.evals_result_["training"][m]
            self._evals[f"{m}_test"] = estimator.evals_result_["valid_1"][m]

        return estimator

    @staticmethod
    def _get_distributions() -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        return CustomDict(
            n_estimators=Int(20, 500, step=10),
            learning_rate=Float(0.01, 1.0, log=True),
            max_depth=Int(-1, 17, step=2),
            num_leaves=Int(20, 40),
            min_child_weight=Float(1e-4, 100, log=True),
            min_child_samples=Int(1, 30),
            subsample=Float(0.5, 1.0, step=0.1),
            colsample_bytree=Float(0.4, 1.0, step=0.1),
            reg_alpha=Float(1e-4, 100, log=True),
            reg_lambda=Float(1e-4, 100, log=True),
        )
class LinearSVM(ClassRegModel):
    """Linear Support Vector Machine.

    Similar to [SupportVectorMachine][] but with a linear kernel.
    Implemented in terms of liblinear rather than libsvm, so it has
    more flexibility in the choice of penalties and loss functions and
    should scale better to large numbers of samples.

    Corresponding estimators are:

    - [LinearSVC][] for classification tasks.
    - [LinearSVR][] for regression tasks.

    Read more in sklearn's [documentation][svmdocs].

    See Also
    --------
    atom.models:KNearestNeighbors
    atom.models:StochasticGradientDescent
    atom.models:SupportVectorMachine

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="lSVM", metric="f1", verbose=2)
    ```

    """

    acronym = "lSVM"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = False
    native_multioutput = False
    has_validation = None
    supports_engines = ["sklearn", "cuml"]

    _module = "svm"
    _estimators = CustomDict({"class": "LinearSVC", "reg": "LinearSVR"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Get the trial's hyperparameters.

        Combinations of `loss`, `penalty` and `dual` that the estimator
        rejects are rewritten to a supported combination.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        params = super()._get_parameters(trial)

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the final branch handles the regression-only loss.
        if self.goal == "class":
            if self._get_param("loss", params) == "hinge":
                # l1 regularization can't be combined with hinge
                params.replace_value("penalty", "l2")
                # l2 regularization can't be combined with hinge when dual=False
                params.replace_value("dual", True)
            elif self._get_param("loss", params) == "squared_hinge":
                # l1 regularization can't be combined with squared_hinge when dual=True
                if self._get_param("penalty", params) == "l1":
                    params.replace_value("dual", False)
        elif self._get_param("loss", params) == "epsilon_insensitive":
            params.replace_value("dual", True)

        return params

    def _get_est(self, **params) -> PREDICTOR:
        """Get the estimator instance.

        For cuml classifiers, `probability` defaults to True unless the
        caller provides it explicitly.

        Parameters
        ----------
        **params
            Unpacked hyperparameters for the estimator.

        Returns
        -------
        Predictor
            Estimator instance.

        """
        if self.engine.get("estimator") == "cuml" and self.goal == "class":
            return self._est_class(probability=params.pop("probability", True), **params)
        else:
            return super()._get_est(**params)

    def _get_distributions(self) -> CustomDict:
        """Get the predefined hyperparameter distributions.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict()
        if self.goal == "class":
            dist["penalty"] = Cat(["l1", "l2"])
            dist["loss"] = Cat(["hinge", "squared_hinge"])
        else:
            dist["loss"] = Cat(["epsilon_insensitive", "squared_epsilon_insensitive"])

        dist["C"] = Float(1e-3, 100, log=True)
        dist["dual"] = Cat([True, False])

        # `dual` is not tuned when the cuml engine is selected
        if self.engine.get("estimator") == "cuml":
            dist.pop("dual")

        return dist
In this model, + the probabilities describing the possible outcomes of a single trial + are modeled using a logistic function. + + Corresponding estimators are: + + - [LogisticRegression][] for classification tasks. + + Read more in sklearn's [documentation][lrdocs]. + + See Also + -------- + atom.models:GaussianProcess + atom.models:LinearDiscriminantAnalysis + atom.models:PassiveAggressive + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="RF", metric="f1", verbose=2) + ``` + + """ + + acronym = "LR" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "sklearnex", "cuml"] + + _module = "linear_model" + _estimators = CustomDict({"class": "LogisticRegression"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + # Limitations on penalty + solver combinations + penalty = self._get_param("penalty", params) + solver = self._get_param("solver", params) + cond_1 = penalty is None and solver == "liblinear" + cond_2 = penalty == "l1" and solver not in ("liblinear", "saga") + cond_3 = penalty == "elasticnet" and solver != "saga" + + if cond_1 or cond_2 or cond_3: + params.replace_value("penalty", "l2") # Change to default value + + if self._get_param("penalty", params) != "elasticnet": + params.pop("l1_ratio") + + if self._get_param("penalty", params) is None: + params.pop("C") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. 
class MultiLayerPerceptron(ClassRegModel):
    """Multi-layer Perceptron.

    Multi-layer Perceptron is a supervised learning algorithm that
    learns a function by training on a dataset. Given a set of features
    and a target, it can learn a non-linear function approximator for
    either classification or regression. It is different from logistic
    regression, in that between the input and the output layer, there
    can be one or more non-linear layers, called hidden layers.

    Corresponding estimators are:

    - [MLPClassifier][] for classification tasks.
    - [MLPRegressor][] for regression tasks.

    Read more in sklearn's [documentation][mlpdocs].

    See Also
    --------
    atom.models:PassiveAggressive
    atom.models:Perceptron
    atom.models:StochasticGradientDescent

    Examples
    --------
    ```pycon
    from atom import ATOMClassifier
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)

    atom = ATOMClassifier(X, y, random_state=1)
    atom.run(models="MLP", metric="f1", verbose=2)
    ```

    """

    acronym = "MLP"
    needs_scaling = True
    accepts_sparse = True
    native_multilabel = True
    native_multioutput = False
    has_validation = "max_iter"
    supports_engines = ["sklearn"]

    _module = "neural_network"
    _estimators = CustomDict({"class": "MLPClassifier", "reg": "MLPRegressor"})

    def _get_parameters(self, trial: Trial) -> CustomDict:
        """Select the hyperparameters to use for this trial.

        Hidden layers are cut off at the first layer with zero neurons,
        and solver-specific learning-rate parameters are dropped.

        Parameters
        ----------
        trial: [Trial][]
            Current trial.

        Returns
        -------
        CustomDict
            Trial's hyperparameters.

        """
        hyperparams = super()._get_parameters(trial)

        # Drop layers when a previous layer has 0 neurons
        layer_keys = [k for k in sorted(hyperparams) if k.startswith("hidden_layer")]
        empty_seen = False
        for key in layer_keys:
            empty_seen = hyperparams[key] == 0 or empty_seen
            if empty_seen:
                hyperparams.pop(key)

        if self._get_param("solver", hyperparams) == "sgd":
            hyperparams.pop("learning_rate_init")
        else:
            hyperparams.pop("learning_rate")
            hyperparams.pop("power_t")

        return hyperparams

    def _trial_to_est(self, params: CustomDict) -> CustomDict:
        """Convert trial's hyperparameters to parameters for the estimator.

        The separate `hidden_layer_*` entries are merged, in sorted key
        order, into one `hidden_layer_sizes` tuple placed first.

        Parameters
        ----------
        params: CustomDict
            Trial's hyperparameters.

        Returns
        -------
        CustomDict
            Estimator's hyperparameters.

        """
        est_params = super()._trial_to_est(params)

        layer_keys = [k for k in sorted(est_params) if k.startswith("hidden_layer")]
        layer_sizes = [est_params.pop(key) for key in layer_keys]

        if layer_sizes:
            est_params.insert(0, "hidden_layer_sizes", tuple(layer_sizes))

        return est_params

    def _get_distributions(self) -> CustomDict:
        """Build the predefined hyperparameter search space.

        Returns
        -------
        CustomDict
            Hyperparameter distributions.

        """
        dist = CustomDict(
            hidden_layer_1=Int(10, 100),
            hidden_layer_2=Int(0, 100),
            hidden_layer_3=Int(0, 10),
            activation=Cat(["identity", "logistic", "tanh", "relu"]),
            solver=Cat(["lbfgs", "sgd", "adam"]),
            alpha=Float(1e-4, 0.1, log=True),
            batch_size=Cat(["auto", 8, 16, 32, 64, 128, 256]),
            learning_rate=Cat(["constant", "invscaling", "adaptive"]),
            learning_rate_init=Float(1e-3, 0.1, log=True),
            power_t=Float(0.1, 0.9, step=0.1),
            max_iter=Int(50, 500, step=10),
        )

        # Drop layers if sizes are specified by user
        if "hidden_layer_sizes" in self._est_params:
            return dist[3:]

        return dist
+ + MultinomialNB implements the Naive Bayes algorithm for multinomially + distributed data, and is one of the two classic Naive Bayes variants + used in text classification (where the data are typically + represented as word vector counts, although tf-idf vectors are also + known to work well in practice). + + Corresponding estimators are: + + - [MultinomialNB][multinomialnbclass] for classification tasks. + + Read more in sklearn's [documentation][mnbdocs]. + + See Also + -------- + atom.models:BernoulliNB + atom.models:ComplementNB + atom.models:GaussianNB + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="MNB", metric="f1", verbose=2) + ``` + + """ + + acronym = "MNB" + needs_scaling = False + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "cuml"] + + _module = "naive_bayes" + _estimators = CustomDict({"class": "MultinomialNB"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + alpha=Float(0.01, 10, log=True), + fit_prior=Cat([True, False]), + ) + + +class OrdinaryLeastSquares(ClassRegModel): + """Linear Regression. + + Ordinary Least Squares is just linear regression without any + regularization. It fits a linear model with coefficients `w=(w1, + ..., wp)` to minimize the residual sum of squares between the + observed targets in the dataset, and the targets predicted by the + linear approximation. + + Corresponding estimators are: + + - [LinearRegression][] for regression tasks. + + Read more in sklearn's [documentation][olsdocs]. 
+ + See Also + -------- + atom.models:ElasticNet + atom.models:Lasso + atom.models:Ridge + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import fetch_california_housing + + X, y = fetch_california_housing(return_X_y=True) + + atom = ATOMRegressor(X, y, random_state=1) + atom.run(models="OLS", metric="r2", verbose=2) + ``` + + """ + + acronym = "OLS" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "sklearnex", "cuml"] + + _module = "linear_model" + _estimators = CustomDict({"reg": "LinearRegression"}) + + +class OrthogonalMatchingPursuit(ClassRegModel): + """Orthogonal Matching Pursuit. + + Orthogonal Matching Pursuit implements the OMP algorithm for + approximating the fit of a linear model with constraints imposed + on the number of non-zero coefficients. + + Corresponding estimators are: + + - [OrthogonalMatchingPursuit][] for regression tasks. + + Read more in sklearn's [documentation][ompdocs]. + + See Also + -------- + atom.models:Lasso + atom.models:LeastAngleRegression + atom.models:OrdinaryLeastSquares + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import fetch_california_housing + + X, y = fetch_california_housing(return_X_y=True) + + atom = ATOMRegressor(X, y, random_state=1) + atom.run(models="OMP", metric="r2", verbose=2) + ``` + + """ + + acronym = "OMP" + needs_scaling = True + accepts_sparse = False + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "linear_model" + _estimators = CustomDict({"reg": "OrthogonalMatchingPursuit"}) + + +class PassiveAggressive(ClassRegModel): + """Passive Aggressive. + + The passive-aggressive algorithms are a family of algorithms for + large-scale learning. They are similar to the Perceptron in that + they do not require a learning rate. 
However, contrary to the + [Perceptron][], they include a regularization parameter `C`. + + Corresponding estimators are: + + - [PassiveAggressiveClassifier][] for classification tasks. + - [PassiveAggressiveRegressor][] for regression tasks. + + Read more in sklearn's [documentation][padocs]. + + See Also + -------- + atom.models:MultiLayerPerceptron + atom.models:Perceptron + atom.models:StochasticGradientDescent + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="PA", metric="f1", verbose=2) + ``` + + """ + + acronym = "PA" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = "max_iter" + supports_engines = ["sklearn"] + + _module = "linear_model" + _estimators = CustomDict( + {"class": "PassiveAggressiveClassifier", "reg": "PassiveAggressiveRegressor"} + ) + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + if self.goal == "class": + loss = ["hinge", "squared_hinge"] + else: + loss = ["epsilon_insensitive", "squared_epsilon_insensitive"] + + return CustomDict( + C=Float(1e-3, 100, log=True), + max_iter=Int(500, 1500, step=50), + loss=Cat(loss), + average=Cat([True, False]), + ) + + +class Perceptron(ClassRegModel): + """Linear Perceptron classification. + + The Perceptron is a simple classification algorithm suitable for + large scale learning. By default: + + * It does not require a learning rate. + * It is not regularized (penalized). + * It updates its model only on mistakes. + + The last characteristic implies that the Perceptron is slightly + faster to train than [StochasticGradientDescent][] with the hinge + loss and that the resulting models are sparser.
+ + Corresponding estimators are: + + - [Perceptron][percclassifier] for classification tasks. + + Read more in sklearn's [documentation][percdocs]. + + See Also + -------- + atom.models:MultiLayerPerceptron + atom.models:PassiveAggressive + atom.models:StochasticGradientDescent + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="Perc", metric="f1", verbose=2) + ``` + + """ + + acronym = "Perc" + needs_scaling = True + accepts_sparse = False + native_multilabel = False + native_multioutput = False + has_validation = "max_iter" + supports_engines = ["sklearn"] + + _module = "linear_model" + _estimators = CustomDict({"class": "Perceptron"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self._get_param("penalty", params) != "elasticnet": + params.pop("l1_ratio") + + return params + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + penalty=Cat([None, "l2", "l1", "elasticnet"]), + alpha=Float(1e-4, 10, log=True), + l1_ratio=Float(0.1, 0.9, step=0.1), + max_iter=Int(500, 1500, step=50), + eta0=Float(1e-2, 10, log=True), + ) + + +class QuadraticDiscriminantAnalysis(ClassRegModel): + """Quadratic Discriminant Analysis. + + Quadratic Discriminant Analysis is a classifier with a quadratic + decision boundary, generated by fitting class conditional densities + to the data and using Bayes’ rule. 
The model fits a Gaussian + density to each class, with every class having its own covariance + matrix (no shared covariance is assumed). + + Corresponding estimators are: + + - [QuadraticDiscriminantAnalysis][qdaclassifier] for classification tasks. + + Read more in sklearn's [documentation][ldadocs]. + + See Also + -------- + atom.models:LinearDiscriminantAnalysis + atom.models:LogisticRegression + atom.models:RadiusNearestNeighbors + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="QDA", metric="f1", verbose=2) + ``` + + """ + + acronym = "QDA" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn"] + + _module = "discriminant_analysis" + _estimators = CustomDict({"class": "QuadraticDiscriminantAnalysis"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict(reg_param=Float(0, 1.0, step=0.1)) + + +class RadiusNearestNeighbors(ClassRegModel): + """Radius Nearest Neighbors. + + Radius Nearest Neighbors implements the nearest neighbors vote, + where the neighbors are selected from within a given radius. For + regression, the target is predicted by local interpolation of the + targets associated with the nearest neighbors in the training set. + + !!! warning + * The `radius` parameter should be tuned to the data at hand or + the model will perform poorly. + * If outliers are detected, the estimator raises an exception + unless `est_params={"outlier_label": "most_frequent"}` is used. + + Corresponding estimators are: + + - [RadiusNeighborsClassifier][] for classification tasks. + - [RadiusNeighborsRegressor][] for regression tasks.
+ + Read more in sklearn's [documentation][knndocs]. + + See Also + -------- + atom.models:KNearestNeighbors + atom.models:LinearDiscriminantAnalysis + atom.models:QuadraticDiscriminantAnalysis + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run( + models="RNN", + metric="f1", + est_params={"outlier_label": "most_frequent"}, + verbose=2, + ) + ``` + + """ + + acronym = "RNN" + needs_scaling = True + accepts_sparse = True + native_multilabel = True + native_multioutput = True + has_validation = None + supports_engines = ["sklearn"] + + _module = "neighbors" + _estimators = CustomDict( + {"class": "RadiusNeighborsClassifier", "reg": "RadiusNeighborsRegressor"} + ) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + radius=Float(1e-2, 100), + weights=Cat(["uniform", "distance"]), + algorithm=Cat(["auto", "ball_tree", "kd_tree", "brute"]), + leaf_size=Int(20, 40), + p=Int(1, 2), + ) + + +class RandomForest(ClassRegModel): + """Random Forest. + + Random forests are an ensemble learning method that operate by + constructing a multitude of decision trees at training time and + outputting the class that is the mode of the classes + (classification) or mean prediction (regression) of the individual + trees. Random forests correct for decision trees' habit of + overfitting to their training set. + + Corresponding estimators are: + + - [RandomForestClassifier][] for classification tasks. + - [RandomForestRegressor][] for regression tasks. + + Read more in sklearn's [documentation][adabdocs]. + + !!! warning + cuML's implementation of [RandomForestClassifier][cumlrf] only + supports predictions on dtype `float32`. 
Convert all dtypes + before calling atom's [run][atomclassifier-run] method to avoid + exceptions. + + See Also + -------- + atom.models:DecisionTree + atom.models:ExtraTrees + atom.models:HistGradientBoosting + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="RF", metric="f1", verbose=2) + ``` + + """ + + acronym = "RF" + needs_scaling = False + accepts_sparse = True + native_multilabel = True + native_multioutput = True + has_validation = None + supports_engines = ["sklearn", "sklearnex", "cuml"] + + _module = "ensemble" + _estimators = CustomDict( + {"class": "RandomForestClassifier", "reg": "RandomForestRegressor"} + ) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if not self._get_param("bootstrap", params): + params.pop("max_samples") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. 
+ + """ + if self.goal == "class": + criterion = ["gini", "entropy"] + else: + if self.engine.get("estimator") == "cuml": + criterion = ["mse", "poisson", "gamma", "inverse_gaussian"] + else: + criterion = ["squared_error", "absolute_error", "poisson"] + + dist = CustomDict( + n_estimators=Int(10, 500, step=10), + criterion=Cat(criterion), + max_depth=Cat([None, *range(1, 17)]), + min_samples_split=Int(2, 20), + min_samples_leaf=Int(1, 20), + max_features=Cat([None, "sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]), + bootstrap=Cat([True, False]), + max_samples=Cat([None, 0.5, 0.6, 0.7, 0.8, 0.9]), + ccp_alpha=Float(0, 0.035, step=0.005), + ) + + if self.engine.get("estimator") == "sklearnex": + dist.pop("criterion") + dist.pop("ccp_alpha") + elif self.engine.get("estimator") == "cuml": + dist.replace_key("criterion", "split_criterion") + dist["max_depth"] = Int(1, 17) + dist["max_features"] = Cat(["sqrt", "log2", 0.5, 0.6, 0.7, 0.8, 0.9]) + dist["max_samples"] = Float(0.5, 0.9, step=0.1) + dist.pop("ccp_alpha") + + return dist + + +class Ridge(ClassRegModel): + """Linear least squares with l2 regularization. + + If classifier, it first converts the target values into {-1, 1} + and then treats the problem as a regression task. + + Corresponding estimators are: + + - [RidgeClassifier][] for classification tasks. + - [Ridge][ridgeregressor] for regression tasks. + + Read more in sklearn's [documentation][ridgedocs]. + + !!! warning + Engines `sklearnex` and `cuml` are only available for regression + tasks. 
+ + See Also + -------- + atom.models:BayesianRidge + atom.models:ElasticNet + atom.models:Lasso + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import fetch_california_housing + + X, y = fetch_california_housing(return_X_y=True) + + atom = ATOMRegressor(X, y, random_state=1) + atom.run(models="Ridge", metric="r2", verbose=2) + ``` + + """ + + acronym = "Ridge" + needs_scaling = True + accepts_sparse = True + native_multilabel = True + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "sklearnex", "cuml"] + + _module = "linear_model" + _estimators = CustomDict({"class": "RidgeClassifier", "reg": "Ridge"}) + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + dist = CustomDict( + alpha=Float(1e-3, 10, log=True), + solver=Cat(["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]), + ) + + if self.goal == "reg": + if self.engine.get("estimator") == "sklearnex": + dist.pop("solver") # Only supports 'auto' + elif self.engine.get("estimator") == "cuml": + dist["solver"] = Cat(["eig", "svd", "cd"]) + + return dist + + +class StochasticGradientDescent(ClassRegModel): + """Stochastic Gradient Descent. + + Stochastic Gradient Descent is a simple yet very efficient approach + to fitting linear classifiers and regressors under convex loss + functions. Even though SGD has been around in the machine learning + community for a long time, it has received a considerable amount of + attention just recently in the context of large-scale learning. + + Corresponding estimators are: + + - [SGDClassifier][] for classification tasks. + - [SGDRegressor][] for regression tasks. + + Read more in sklearn's [documentation][sgddocs]. 
+ + See Also + -------- + atom.models:MultiLayerPerceptron + atom.models:PassiveAggressive + atom.models:SupportVectorMachine + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="SGD", metric="f1", verbose=2) + ``` + + """ + + acronym = "SGD" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = "max_iter" + supports_engines = ["sklearn"] + + _module = "linear_model" + _estimators = CustomDict({"class": "SGDClassifier", "reg": "SGDRegressor"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self._get_param("penalty", params) != "elasticnet": + params.pop("l1_ratio") + + if self._get_param("learning_rate", params) == "optimal": + params.pop("eta0") + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. 
+ + """ + loss = [ + "hinge", + "log_loss", + "modified_huber", + "squared_hinge", + "perceptron", + "squared_error", + "huber", + "epsilon_insensitive", + "squared_epsilon_insensitive", + ] + + return CustomDict( + loss=Cat(loss if self.goal == "class" else loss[-4:]), + penalty=Cat([None, "l1", "l2", "elasticnet"]), + alpha=Float(1e-4, 1.0, log=True), + l1_ratio=Float(0.1, 0.9, step=0.1), + max_iter=Int(500, 1500, step=50), + epsilon=Float(1e-4, 1.0, log=True), + learning_rate=Cat(["constant", "invscaling", "optimal", "adaptive"]), + eta0=Float(1e-2, 10, log=True), + power_t=Float(0.1, 0.9, step=0.1), + average=Cat([True, False]), + ) + + +class SupportVectorMachine(ClassRegModel): + """Support Vector Machine. + + The implementation of the Support Vector Machine is based on libsvm. + The fit time scales at least quadratically with the number of + samples and may be impractical beyond tens of thousands of samples. + For large datasets consider using a [LinearSVM][] or a + [StochasticGradientDescent][] model instead. + + Corresponding estimators are: + + - [SVC][] for classification tasks. + - [SVR][] for regression tasks. + + Read more in sklearn's [documentation][svmdocs]. + + See Also + -------- + atom.models:LinearSVM + atom.models:MultiLayerPerceptron + atom.models:StochasticGradientDescent + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="SVM", metric="f1", verbose=2) + ``` + + """ + + acronym = "SVM" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = None + supports_engines = ["sklearn", "sklearnex", "cuml"] + + _module = "svm" + _estimators = CustomDict({"class": "SVC", "reg": "SVR"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters.
+ + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self.goal == "class": + params.pop("epsilon") + + kernel = self._get_param("kernel", params) + if kernel == "poly": + params.replace_value("gamma", "scale") # Crashes in combination with "auto" + else: + params.pop("degree") + + if kernel not in ("rbf", "poly", "sigmoid"): + params.pop("gamma") + + if kernel not in ("poly", "sigmoid"): + params.pop("coef0") + + return params + + def _get_est(self, **params) -> PREDICTOR: + """Get the model's estimator with unpacked parameters. + + Returns + ------- + Predictor + Estimator instance. + + """ + if self.engine.get("estimator") == "cuml" and self.goal == "class": + return self._est_class( + probability=params.pop("probability", True), + random_state=params.pop("random_state", self.random_state), + **params) + else: + return super()._get_est(**params) + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + dist = CustomDict( + C=Float(1e-3, 100, log=True), + kernel=Cat(["linear", "poly", "rbf", "sigmoid"]), + degree=Int(2, 5), + gamma=Cat(["scale", "auto"]), + coef0=Float(-1.0, 1.0), + epsilon=Float(1e-3, 100, log=True), + shrinking=Cat([True, False]), + ) + + if self.engine.get("estimator") == "cuml": + dist.pop("epsilon") + dist.pop("shrinking") + + return dist + + +class XGBoost(ClassRegModel): + """Extreme Gradient Boosting. + + XGBoost is an optimized distributed gradient boosting model + designed to be highly efficient, flexible and portable. XGBoost + provides a parallel tree boosting that solves many data science + problems in a fast and accurate way. + + Corresponding estimators are: + + - [XGBClassifier][] for classification tasks. + - [XGBRegressor][] for regression tasks.
+ + Read more in XGBoost's [documentation][xgbdocs]. + + See Also + -------- + atom.models:CatBoost + atom.models:GradientBoostingMachine + atom.models:LightGBM + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(models="XGB", metric="f1", verbose=2) + ``` + + """ + + acronym = "XGB" + needs_scaling = True + accepts_sparse = True + native_multilabel = False + native_multioutput = False + has_validation = "n_estimators" + supports_engines = ["xgboost"] + + _module = "xgboost" + _estimators = CustomDict({"class": "XGBClassifier", "reg": "XGBRegressor"}) + + def _get_est(self, **params) -> PREDICTOR: + """Get the model's estimator with unpacked parameters. + + Returns + ------- + Predictor + Estimator instance. + + """ + eval_metric = None + if getattr(self, "_metric", None): + eval_metric = XGBMetric(self._metric[0], task=self.task) + + return self._est_class( + eval_metric=params.pop("eval_metric", eval_metric), + n_jobs=params.pop("n_jobs", self.n_jobs), + tree_method=params.pop("tree_method", "gpu_hist" if self._gpu else None), + gpu_id=self._device_id, + verbosity=params.pop("verbosity", 0), + random_state=params.pop("random_state", self.random_state), + **params, + ) + + def _fit_estimator( + self, + estimator: PREDICTOR, + data: tuple[DATAFRAME, PANDAS], + est_params_fit: dict, + validation: tuple[DATAFRAME, PANDAS] | None = None, + trial: Trial | None = None, + ): + """Fit the estimator and perform in-training validation. + + Parameters + ---------- + estimator: Predictor + Instance to fit. + + data: tuple + Training data of the form (X, y). + + est_params_fit: dict + Additional parameters for the estimator's fit method. + + validation: tuple or None + Validation data of the form (X, y). If None, no validation + is performed. 
+ + trial: [Trial][] or None + Active trial (during hyperparameter tuning). + + Returns + ------- + Predictor + Fitted instance. + + """ + m = self._metric[0].name + params = est_params_fit.copy() + + callbacks = params.pop("callbacks", []) + if trial and len(self._metric) == 1: + callbacks.append(XGBoostPruningCallback(trial, f"validation_1-{m}")) + + try: + estimator.set_params(callbacks=callbacks) + estimator.fit( + *data, + eval_set=[data, validation] if validation else None, + verbose=params.get("verbose", False), + **params, + ) + except TrialPruned as ex: + # Add the pruned step to the output + step = str(ex).split(" ")[-1][:-1] + steps = estimator.get_params()[self.has_validation] + trial.params[self.has_validation] = f"{step}/{steps}" + + trial.set_user_attr("estimator", estimator) + raise ex + + if validation: + # Create evals attribute with train and validation scores + # Negative because minimizes the function + results = estimator.evals_result() + self._evals[f"{m}_train"] = np.negative(results["validation_0"][m]) + self._evals[f"{m}_test"] = np.negative(results["validation_1"][m]) + + return estimator + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + n_estimators=Int(20, 500, step=10), + learning_rate=Float(0.01, 1.0, log=True), + max_depth=Int(1, 20), + gamma=Float(0, 1.0), + min_child_weight=Int(1, 10), + subsample=Float(0.5, 1.0, step=0.1), + colsample_bytree=Float(0.4, 1.0, step=0.1), + reg_alpha=Float(1e-4, 100, log=True), + reg_lambda=Float(1e-4, 100, log=True), + ) diff --git a/atom/models/ensembles.py b/atom/models/ensembles.py new file mode 100644 index 000000000..39d890983 --- /dev/null +++ b/atom/models/ensembles.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing all ensemble models. 
+ +""" + +from __future__ import annotations + +from atom.basemodel import ClassRegModel +from atom.pipeline import Pipeline +from atom.utils.types import PREDICTOR +from atom.utils.utils import ClassMap, CustomDict, sign + + +class Stacking(ClassRegModel): + """Stacking ensemble. + + Parameters + ---------- + models: ClassMap + Models from which to build the ensemble. + + **kwargs + Additional keyword arguments for the estimator. + + """ + + acronym = "Stack" + needs_scaling = False + has_validation = None + native_multilabel = False + native_multioutput = False + supports_engines = [] + + _module = "atom.ensembles" + _estimators = CustomDict({"class": "StackingClassifier", "reg": "StackingRegressor"}) + + def __init__(self, models: ClassMap, **kwargs): + self._models = models + kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)} + super().__init__(**kw_model) + self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model} + + def _get_est(self, **params) -> PREDICTOR: + """Get the model's estimator with unpacked parameters. + + Returns + ------- + Predictor + Estimator instance. + + """ + estimators = [] + for m in self._models: + if m.scaler: + name = f"pipeline_{m.name}" + est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)]) + else: + name = m.name + est = m.estimator + + estimators.append((name, est)) + + return self._est_class( + estimators=estimators, + n_jobs=params.pop("n_jobs", self.n_jobs), + **params, + ) + + +class Voting(ClassRegModel): + """Voting ensemble. + + Parameters + ---------- + models: ClassMap + Models from which to build the ensemble. + + **kwargs + Additional keyword arguments for the estimator. 
+ + """ + + acronym = "Vote" + needs_scaling = False + has_validation = None + native_multilabel = False + native_multioutput = False + supports_engines = [] + + _module = "atom.ensembles" + _estimators = CustomDict({"class": "VotingClassifier", "reg": "VotingRegressor"}) + + def __init__(self, models: ClassMap, **kwargs): + self._models = models + kw_model = {k: v for k, v in kwargs.items() if k in sign(ClassRegModel.__init__)} + super().__init__(**kw_model) + self._est_params = {k: v for k, v in kwargs.items() if k not in kw_model} + + if self._est_params.get("voting") == "soft": + for m in self._models: + if not hasattr(m.estimator, "predict_proba"): + raise ValueError( + "Invalid value for the voting parameter. If " + "'soft', all models in the ensemble should have " + f"a predict_proba method, got {m._fullname}." + ) + + def _get_est(self, **params) -> PREDICTOR: + """Get the model's estimator with unpacked parameters. + + Returns + ------- + Predictor + Estimator instance. + + """ + estimators = [] + for m in self._models: + if m.scaler: + name = f"pipeline_{m.name}" + est = Pipeline([("scaler", m.scaler), (m.name, m.estimator)]) + else: + name = m.name + est = m.estimator + + estimators.append((name, est)) + + return self._est_class( + estimators=estimators, + n_jobs=params.pop("n_jobs", self.n_jobs), + **params, + ) diff --git a/atom/models/ts.py b/atom/models/ts.py new file mode 100644 index 000000000..b3680a95a --- /dev/null +++ b/atom/models/ts.py @@ -0,0 +1,535 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing all time series models. 
+ +""" + +from __future__ import annotations + +from optuna.distributions import CategoricalDistribution as Cat +from optuna.distributions import IntDistribution as Int +from optuna.trial import Trial + +from atom.basemodel import ForecastModel +from atom.utils.utils import CustomDict + + +class ARIMA(ForecastModel): + """Autoregressive Integrated Moving Average Model. + + Seasonal ARIMA models and exogenous input are supported, hence this + estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX. + + An ARIMA model is a generalization of an autoregressive moving + average (ARMA) model, and is fitted to time-series data in an effort + to forecast future points. ARIMA models can be especially + efficacious in cases where data shows evidence of non-stationarity. + + The "AR" part of ARIMA indicates that the evolving variable of + interest is regressed on its own lagged (i.e., prior observed) + values. The "MA" part indicates that the regression error is + actually a linear combination of error terms whose values occurred + contemporaneously and at various times in the past. The "I" (for + "integrated") indicates that the data values have been replaced with + the difference between their values and the previous values (and this + differencing process may have been performed more than once). + + Corresponding estimators are: + + - [ARIMA][arimaclass] for forecasting tasks. + + !!! warning + ARIMA often runs into numerical errors when optimizing the + hyperparameters. Possible solutions are: + + - Use the [AutoARIMA][] model instead. + - Use [`est_params`][directforecaster-est_params] to specify the + orders manually, e.g. `#!python atom.run("arima", n_trials=5, + est_params={"order": (1, 1, 0)})`. + - Use the `catch` parameter in [`ht_params`][directforecaster-ht_params] + to avoid raising every exception, e.g. `#!python atom.run("arima", + n_trials=5, ht_params={"catch": (Exception,)})`.
+ + See Also + -------- + atom.models:AutoARIMA + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_longley + + _, X = load_longley() + + atom = ATOMForecaster(X) + atom.run(models="ARIMA", verbose=2) + ``` + + """ + + acronym = "ARIMA" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = True + has_validation = None + supports_engines = ["sktime"] + + _module = "sktime.forecasting.arima" + _estimators = CustomDict({"fc": "ARIMA"}) + + _order = ("p", "d", "q") + _sorder = ("Ps", "Ds", "Qs", "S") + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + # If no seasonal periodicity, set seasonal components to zero + if self._get_param("S", params) == 0: + for p in self._sorder: + params.replace_value(p, 0) + + return params + + def _trial_to_est(self, params: CustomDict) -> CustomDict: + """Convert trial's hyperparameters to parameters for the estimator. + + Parameters + ---------- + params: CustomDict + Trial's hyperparameters. + + Returns + ------- + CustomDict + Estimator's hyperparameters. + + """ + params = super()._trial_to_est(params) + + # Convert params to hyperparameters order and seasonal_order + if all(p in params for p in self._sorder): + params.insert(0, "seasonal_order", tuple(params.pop(p) for p in self._sorder)) + if all(p in params for p in self._order): + params.insert(0, "order", tuple(params.pop(p) for p in self._order)) + + return params + + def _get_distributions(self) -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. 
+ + """ + methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"] + + dist = CustomDict( + p=Int(0, 2), + d=Int(0, 1), + q=Int(0, 2), + Ps=Int(0, 2), + Ds=Int(0, 1), + Qs=Int(0, 2), + S=Cat([0, 4, 6, 7, 12]), + method=Cat(methods), + maxiter=Int(50, 200, step=10), + with_intercept=Cat([True, False]), + ) + + # Drop order and seasonal_order params if specified by user + if "order" in self._est_params: + for p in self._order: + dist.pop(p) + if "seasonal_order" in self._est_params: + for p in self._sorder: + dist.pop(p) + + return dist + + +class AutoARIMA(ForecastModel): + """Automatic Autoregressive Integrated Moving Average Model. + + [ARIMA][] implementation that includes automated fitting of + (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA + algorithm seeks to identify the optimal parameters for an + ARIMA model, settling on a single fitted ARIMA model. This process + is based on the commonly-used R function. + + AutoARIMA works by conducting differencing tests (i.e., + Kwiatkowski–Phillips–Schmidt–Shin, Augmented Dickey-Fuller or + Phillips–Perron) to determine the order of differencing, d, and + then fitting models within defined ranges. AutoARIMA also seeks + to identify the optimal P and Q hyperparameters after conducting + the Canova-Hansen test to determine the optimal order of seasonal + differencing. + + Note that due to stationarity issues, AutoARIMA might not find a + suitable model that will converge. If this is the case, a ValueError + is thrown suggesting stationarity-inducing measures be taken prior + to re-fitting or that a new range of order values be selected. + + Corresponding estimators are: + + - [AutoARIMA][autoarimaclass] for forecasting tasks. 
+ + See Also + -------- + atom.models:ARIMA + atom.models:ETS + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_longley + + _, X = load_longley() + + atom = ATOMForecaster(X, random_state=1) + atom.run(models="autoarima", verbose=2) + ``` + + """ + + acronym = "AutoARIMA" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = True + has_validation = None + supports_engines = ["sktime"] + + _module = "sktime.forecasting.arima" + _estimators = CustomDict({"fc": "AutoARIMA"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + methods = ["newton", "nm", "bfgs", "lbfgs", "powell", "cg", "ncg", "basinhopping"] + + return CustomDict( + method=Cat(methods), + maxiter=Int(50, 200, step=10), + with_intercept=Cat([True, False]), + ) + + +class ExponentialSmoothing(ForecastModel): + """Exponential Smoothing forecaster. + + Holt-Winters exponential smoothing forecaster. The default settings + use simple exponential smoothing, without trend and seasonality + components. + + Corresponding estimators are: + + - [ExponentialSmoothing][esclass] for forecasting tasks. 
+ + See Also + -------- + atom.models:ARIMA + atom.models:ETS + atom.models:PolynomialTrend + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="ES", verbose=2) + ``` + + """ + + acronym = "ES" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = True + has_validation = None + supports_engines = ["sktime"] + + _module = "sktime.forecasting.exp_smoothing" + _estimators = CustomDict({"fc": "ExponentialSmoothing"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + if self._get_param("trend", params) is None: + params.pop("damped_trend") + + if self._get_param("sp", params) is None: + params.pop("seasonal") + + return params + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + methods = ["L-BFGS-B", "TNC", "SLSQP", "Powell", "trust-constr", "bh", "ls"] + + return CustomDict( + trend=Cat(["add", "mul", None]), + damped_trend=Cat([True, False]), + seasonal=Cat(["add", "mul", None]), + sp=Cat([4, 6, 7, 12, None]), + use_boxcox=Cat([True, False]), + initialization_method=Cat(["estimated", "heuristic"]), + method=Cat(methods), + ) + + +class ETS(ForecastModel): + """ETS model with automatic fitting capabilities. + + The ETS models are a family of time series models with an + underlying state space model consisting of a level component, + a trend component (T), a seasonal component (S), and an error + term (E). + + Corresponding estimators are: + + - [AutoETS][] for forecasting tasks. 
+ + See Also + -------- + atom.models:ARIMA + atom.models:ExponentialSmoothing + atom.models:PolynomialTrend + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="ETS", verbose=2) + + ``` + + """ + + acronym = "ETS" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = True + has_validation = None + supports_engines = ["sktime"] + + _module = "sktime.forecasting.ets" + _estimators = CustomDict({"fc": "AutoETS"}) + + def _get_parameters(self, trial: Trial) -> CustomDict: + """Get the trial's hyperparameters. + + Parameters + ---------- + trial: [Trial][] + Current trial. + + Returns + ------- + CustomDict + Trial's hyperparameters. + + """ + params = super()._get_parameters(trial) + + # If no seasonal periodicity (sp=1), drop the seasonal parameter + if self._get_param("sp", params) == 1: + params.pop("seasonal") + + return params + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + error=Cat(["add", "mul"]), + trend=Cat(["add", "mul", None]), + damped_trend=Cat([True, False]), + seasonal=Cat(["add", "mul", None]), + sp=Cat([1, 4, 6, 7, 12]), + initialization_method=Cat(["estimated", "heuristic"]), + maxiter=Int(500, 2000, step=100), + auto=Cat([True, False]), + information_criterion=Cat(["aic", "bic", "aicc"]), + ) + + +class NaiveForecaster(ForecastModel): + """Naive Forecaster. + + NaiveForecaster is a dummy forecaster that makes forecasts using + simple strategies based on naive assumptions about past trends + continuing. When used in [multivariate][] tasks, each column is + forecasted with the same strategy. + + Corresponding estimators are: + + - [NaiveForecaster][naiveforecasterclass] for forecasting tasks. 
+ + See Also + -------- + atom.models:ExponentialSmoothing + atom.models:Dummy + atom.models:PolynomialTrend + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="NF", verbose=2) + + ``` + + """ + + acronym = "NF" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = True + has_validation = None + supports_engines = ["sktime"] + + _module = "sktime.forecasting.naive" + _estimators = CustomDict({"fc": "NaiveForecaster"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. + + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict(strategy=Cat(["last", "mean", "drift"])) + + +class PolynomialTrend(ForecastModel): + """Polynomial Trend forecaster. + + Forecast time series data with a polynomial trend, using a sklearn + [LinearRegression][] class to regress values of time series on + index, after extraction of polynomial features. + + Corresponding estimators are: + + - [PolynomialTrendForecaster][] for forecasting tasks. + + See Also + -------- + atom.models:ARIMA + atom.models:ETS + atom.models:NaiveForecaster + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.run(models="PT", verbose=2) + ``` + + """ + + acronym = "PT" + needs_scaling = False + accepts_sparse = False + native_multilabel = False + native_multioutput = True + has_validation = None + supports_engines = ["sktime"] + + _module = "sktime.forecasting.trend" + _estimators = CustomDict({"fc": "PolynomialTrendForecaster"}) + + @staticmethod + def _get_distributions() -> CustomDict: + """Get the predefined hyperparameter distributions. 
+ + Returns + ------- + CustomDict + Hyperparameter distributions. + + """ + return CustomDict( + degree=Int(1, 5), + with_intercept=Cat([True, False]), + ) diff --git a/atom/nlp.py b/atom/nlp.py index ee3f79a07..5d43a9e14 100644 --- a/atom/nlp.py +++ b/atom/nlp.py @@ -949,7 +949,7 @@ class Vectorizer(BaseEstimator, TransformerMixin, BaseTransformer): def __init__( self, - strategy: str = "bow", + strategy: Literal["bow", "tfidf", "hashing"] = "bow", *, return_sparse: BOOL = True, device: str = "cpu", @@ -1001,17 +1001,11 @@ def fit(self, X: FEATURES, y: TARGET | None = None) -> Vectorizer: hashing="HashingVectorizer", ) - if self.strategy in strategies: - estimator = self._get_est_class( - name=strategies[self.strategy], - module="feature_extraction.text", - ) - self._estimator = estimator(**self.kwargs) - else: - raise ValueError( - "Invalid value for the strategy parameter, got " - f"{self.strategy}. Choose from: {', '.join(strategies)}." - ) + estimator = self._get_est_class( + name=strategies[self.strategy], + module="feature_extraction.text", + ) + self._estimator = estimator(**self.kwargs) self.log("Fitting Vectorizer...", 1) self._estimator.fit(X[corpus]) diff --git a/atom/pipeline.py b/atom/pipeline.py index 9c9fcd2e6..3f68174cb 100644 --- a/atom/pipeline.py +++ b/atom/pipeline.py @@ -22,7 +22,8 @@ from typeguard import typechecked from atom.utils.types import ( - BOOL, DATAFRAME, ESTIMATOR, FEATURES, FLOAT, SEQUENCE, SERIES, TARGET, INT + BOOL, DATAFRAME, ESTIMATOR, FEATURES, FLOAT, INT, PANDAS, SEQUENCE, SERIES, + TARGET, ) from atom.utils.utils import ( check_is_fitted, fit_one, fit_transform_one, transform_one, @@ -261,7 +262,7 @@ def transform( self, X: FEATURES | None = None, y: TARGET | None = None, - ) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]: + ) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]: """Transform the data. Call `transform` on each transformer in the pipeline. 
The @@ -304,7 +305,7 @@ def fit_transform( X: FEATURES | None = None, y: TARGET | None = None, **fit_params, - ) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]: + ) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]: """Fit the pipeline and transform the data. Parameters @@ -314,13 +315,15 @@ def fit_transform( X is ignored. None if the estimator only uses y. - y: int, str, dict, sequence or None, default=None + y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X. - If None: y is ignored. - If int: Position of the target column in X. - If str: Name of the target column in X. - - Else: Array with shape=(n_samples,) to use as target. + - If sequence: Target array with shape=(n_samples,) or + sequence of column names or positions for multioutput tasks. + - If dataframe: Target columns for multioutput tasks. **fit_params Additional keyword arguments for the fit method. @@ -330,7 +333,7 @@ def fit_transform( dataframe Transformed feature set. Only returned if provided. - series + series or dataframe Transformed target column. Only returned if provided. """ @@ -352,7 +355,7 @@ def inverse_transform( self, X: FEATURES | None = None, y: TARGET | None = None, - ) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]: + ) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]: """Inverse transform for each step in a reverse order. All estimators in the pipeline must implement the @@ -364,20 +367,22 @@ def inverse_transform( Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y. - y: int, str, dict, sequence or None, default=None + y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X. - If None: y is ignored. - If int: Position of the target column in X. - If str: Name of the target column in X. - - Else: Array with shape=(n_samples,) to use as target. 
+ - If sequence: Target array with shape=(n_samples,) or + sequence of column names or positions for multioutput tasks. + - If dataframe: Target columns for multioutput tasks. Returns ------- dataframe Transformed feature set. Only returned if provided. - series + series or dataframe Transformed target column. Only returned if provided. """ diff --git a/atom/plots.py b/atom/plots.py deleted file mode 100644 index 5da24fdfb..000000000 --- a/atom/plots.py +++ /dev/null @@ -1,8289 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Automated Tool for Optimized Modelling (ATOM) -Author: Mavs -Description: Module containing the plotting classes. - -""" - -from __future__ import annotations - -from collections import defaultdict -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from functools import reduce -from importlib.util import find_spec -from itertools import chain, cycle - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import plotly.express as px -import plotly.graph_objects as go -import shap -from joblib import Parallel, delayed -from mlflow.tracking import MlflowClient -from nltk.collocations import ( - BigramCollocationFinder, QuadgramCollocationFinder, - TrigramCollocationFinder, -) -from optuna.importance import FanovaImportanceEvaluator -from optuna.trial import TrialState -from optuna.visualization._parallel_coordinate import ( - _get_dims_from_info, _get_parallel_coordinate_info, -) -from optuna.visualization._terminator_improvement import _get_improvement_info -from optuna.visualization._utils import _is_log_scale -from plotly.colors import unconvert_from_RGB_255, unlabel_rgb -from scipy import stats -from scipy.stats.mstats import mquantiles -from sklearn.calibration import calibration_curve -from sklearn.inspection import partial_dependence, permutation_importance -from sklearn.metrics import ( - confusion_matrix, det_curve, precision_recall_curve, roc_curve, -) -from 
sklearn.utils import _safe_indexing -from sklearn.utils._bunch import Bunch -from sklearn.utils.metaestimators import available_if -from sktime.forecasting.base import ForecastingHorizon -from typeguard import typechecked - -from atom.utils.constants import PALETTE -from atom.utils.types import ( - BOOL, DATAFRAME, FEATURES, FLOAT, INDEX, INT, INT_TYPES, METRIC_SELECTOR, - MODEL, SCALAR, SEQUENCE, SERIES, SLICE, -) -from atom.utils.utils import ( - bk, check_canvas, check_dependency, check_hyperparams, check_predict_proba, - composed, crash, divide, get_best_score, get_corpus, get_custom_scorer, - has_attr, has_task, is_binary, is_multioutput, it, lst, plot_from_model, - rnd, to_rgb, -) - - -@dataclass -class Aesthetics: - """Keeps track of plot aesthetics.""" - - palette: SEQUENCE # Sequence of colors - title_fontsize: INT # Fontsize for titles - label_fontsize: INT # Fontsize for labels, legend and hoverinfo - tick_fontsize: INT # Fontsize for ticks - line_width: INT # Width of the line plots - marker_size: INT # Size of the markers - - -@typechecked -class BaseFigure: - """Base plotly figure. - - The instance stores the position of the current axes in grid, - as well as the models used for the plot (to track in mlflow). - - Parameters - ---------- - rows: int, default=1 - Number of subplot rows in the canvas. - - cols: int, default=1 - Number of subplot columns in the canvas. - - horizontal_spacing: float, default=0.05 - Space between subplot rows in normalized plot coordinates. - The spacing is relative to the figure's size. - - vertical_spacing: float, default=0.07 - Space between subplot cols in normalized plot coordinates. - The spacing is relative to the figure's size. - - palette: str or sequence, default="Prism" - Name or color sequence for the palette. - - is_canvas: bool, default=False - Whether the figure shows multiple plots. - - backend: str, default="plotly" - Figure's backend. Choose between plotly or matplotlib. 
- - create_figure: bool, default=True - Whether to create a new figure. - - """ - - _marker = ["circle", "x", "diamond", "pentagon", "star", "hexagon"] - _dash = [None, "dashdot", "dash", "dot", "longdash", "longdashdot"] - _shape = ["", "/", "x", "\\", "-", "|", "+", "."] - - def __init__( - self, - rows: INT = 1, - cols: INT = 1, - horizontal_spacing: FLOAT = 0.05, - vertical_spacing: FLOAT = 0.07, - palette: str | SEQUENCE = "Prism", - is_canvas: BOOL = False, - backend: str = "plotly", - create_figure: BOOL = True, - ): - self.rows = rows - self.cols = cols - self.horizontal_spacing = horizontal_spacing - self.vertical_spacing = vertical_spacing - if isinstance(palette, str): - self._palette = getattr(px.colors.qualitative, palette) - self.palette = cycle(self._palette) - else: - # Convert color names or hex to rgb - self._palette = list(map(to_rgb, palette)) - self.palette = cycle(self._palette) - self.is_canvas = is_canvas - self.backend = backend - self.create_figure = create_figure - - self.idx = 0 # N-th plot in the canvas - self.axes = 0 # N-th axis in the canvas - if self.create_figure: - if self.backend == "plotly": - self.figure = go.Figure() - else: - self.figure, _ = plt.subplots() - - self.groups = [] - self.style = dict(palette={}, marker={}, dash={}, shape={}) - self.marker = cycle(self._marker) - self.dash = cycle(self._dash) - self.shape = cycle(self._shape) - - self.pos = {} # Subplot position to use for title - self.custom_layout = {} # Layout params specified by user - self.used_models = [] # Models plotted in this figure - - # Perform parameter checks - if not 0 < horizontal_spacing < 1: - raise ValueError( - "Invalid value for the horizontal_spacing parameter. The " - f"value must lie between 0 and 1, got {horizontal_spacing}." - ) - - if not 0 < vertical_spacing < 1: - raise ValueError( - "Invalid value for the vertical_spacing parameter. The " - f"value must lie between 0 and 1, got {vertical_spacing}." 
- ) - - @property - def grid(self) -> tuple[INT, INT]: - """Position of the current axes on the grid. - - Returns - ------- - int - X-position. - - int - Y-position. - - """ - return (self.idx - 1) // self.cols + 1, self.idx % self.cols or self.cols - - @property - def next_subplot(self) -> go.Figure | plt.Figure | None: - """Increase the subplot index. - - Returns - ------- - go.Figure, plt.Figure or None - Current figure. Returns None if `create_figure=False`. - - """ - # Check if there are too many plots in the canvas - if self.idx >= self.rows * self.cols: - raise ValueError( - "Invalid number of plots in the canvas! Increase " - "the number of rows and cols to add more plots." - ) - else: - self.idx += 1 - - if self.create_figure: - return self.figure - - def get_elem(self, name: SCALAR | str | None = None, element: str = "palette") -> str: - """Get the plot element for a specific name. - - This method is used to assign the same element (color, marker, - etc...) to the same columns and models in a plot. - - Parameters - ---------- - name: int, float or str or None - Name for which to get the plot element. The name is stored in - the element attributes to assign the same element to all calls - with the same name. - - element: str, default="palette" - Plot element to get. Choose from: palette, marker, dash, shape. - - Returns - ------- - str - Element code. - - """ - if name is None: - return getattr(self, f"_{element}")[0] # Get first element (default) - elif name in self.style[element]: - return self.style[element][name] - else: - return self.style[element].setdefault(name, next(getattr(self, element))) - - def showlegend(self, name: str, legend: str | dict | None) -> BOOL: - """Get whether the trace should be showed in the legend. - - If there's already a trace with the same name, it's not - necessary to show it in the plot's legend. - - Parameters - ---------- - name: str - Name of the trace. - - legend: str, dict or None - Legend parameter. 
- - Returns - ------- - bool - Whether the trace should be placed in the legend. - - """ - if name in self.groups: - return False - else: - self.groups.append(name) - return legend is not None - - def get_axes( - self, - x: tuple[INT, INT] = (0, 1), - y: tuple[INT, INT] = (0, 1), - coloraxis: dict | None = None, - ) -> tuple[str, str]: - """Create and update the plot's axes. - - Parameters - ---------- - x: tuple of int - Relative x-size of the plot. - - y: tuple of int - Relative y-size of the plot. - - coloraxis: dict or None - Properties of the coloraxis to create. None to ignore. - - Returns - ------- - str - Name of the x-axis. - - str - Name of the y-axis. - - """ - self.axes += 1 - - # Calculate the distance between subplots - x_offset = divide(self.horizontal_spacing, (self.cols - 1)) - y_offset = divide(self.vertical_spacing, (self.rows - 1)) - - # Calculate the size of the subplot - x_size = (1 - ((x_offset * 2) * (self.cols - 1))) / self.cols - y_size = (1 - ((y_offset * 2) * (self.rows - 1))) / self.rows - - # Calculate the size of the axes - ax_size = (x[1] - x[0]) * x_size - ay_size = (y[1] - y[0]) * y_size - - # Determine the position for the axes - x_pos = (self.grid[1] - 1) * (x_size + 2 * x_offset) + x[0] * x_size - y_pos = (self.rows - self.grid[0]) * (y_size + 2 * y_offset) + y[0] * y_size - - # Store positions for subplot title - self.pos[str(self.axes)] = (x_pos + ax_size / 2, rnd(y_pos + ay_size)) - - # Update the figure with the new axes - self.figure.update_layout( - { - f"xaxis{self.axes}": dict( - domain=(x_pos, rnd(x_pos + ax_size)), anchor=f"y{self.axes}" - ), - f"yaxis{self.axes}": dict( - domain=(y_pos, rnd(y_pos + ay_size)), anchor=f"x{self.axes}" - ), - } - ) - - # Place a colorbar right of the axes - if coloraxis: - if title := coloraxis.pop("title", None): - coloraxis["colorbar_title"] = dict( - text=title, side="right", font_size=coloraxis.pop("font_size") - ) - - coloraxis["colorbar_x"] = rnd(x_pos + ax_size) + ax_size / 40 - 
coloraxis["colorbar_xanchor"] = "left" - coloraxis["colorbar_y"] = y_pos + ay_size / 2 - coloraxis["colorbar_yanchor"] = "middle" - coloraxis["colorbar_len"] = ay_size * 0.9 - coloraxis["colorbar_thickness"] = ax_size * 30 # Default width in pixels - self.figure.update_layout( - {f"coloraxis{coloraxis.pop('axes', self.axes)}": coloraxis} - ) - - xaxis = f"x{self.axes if self.axes > 1 else ''}" - yaxis = f"y{self.axes if self.axes > 1 else ''}" - return xaxis, yaxis - - -@typechecked -class BasePlot: - """Base class for all plotting methods. - - This base class defines the properties that can be changed - to customize the plot's aesthetics. - - """ - - _fig = None - _custom_layout = {} - _custom_traces = {} - _aesthetics = Aesthetics( - palette=list(PALETTE), - title_fontsize=24, - label_fontsize=16, - tick_fontsize=12, - line_width=2, - marker_size=8, - ) - - # Properties =================================================== >> - - @property - def aesthetics(self) -> dict: - """All plot aesthetic attributes.""" - return self._aesthetics - - @aesthetics.setter - def aesthetics(self, value: dict): - self.palette = value.get("palette", self.palette) - self.title_fontsize = value.get("title_fontsize", self.title_fontsize) - self.label_fontsize = value.get("label_fontsize", self.label_fontsize) - self.tick_fontsize = value.get("tick_fontsize", self.tick_fontsize) - self.line_width = value.get("line_width", self.line_width) - self.marker_size = value.get("marker_size", self.marker_size) - - @property - def palette(self) -> str | SEQUENCE: - """Color palette. - - Specify one of plotly's [built-in palettes][palette] or create - a custom one, e.g. `atom.palette = ["red", "green", "blue"]`. - - """ - return self._aesthetics.palette - - @palette.setter - def palette(self, value: str | SEQUENCE): - if isinstance(value, str) and not hasattr(px.colors.qualitative, value): - raise ValueError( - f"Invalid value for the palette parameter, got {value}. 
Choose " - f"from one of plotly's built-in qualitative color sequences in " - f"the px.colors.qualitative module or define your own sequence." - ) - - self._aesthetics.palette = value - - @property - def title_fontsize(self) -> INT: - """Fontsize for the plot's title.""" - return self._aesthetics.title_fontsize - - @title_fontsize.setter - def title_fontsize(self, value: INT): - if value <= 0: - raise ValueError( - "Invalid value for the title_fontsize parameter. " - f"Value should be >=0, got {value}." - ) - - self._aesthetics.title_fontsize = value - - @property - def label_fontsize(self) -> INT: - """Fontsize for the labels, legend and hover information.""" - return self._aesthetics.label_fontsize - - @label_fontsize.setter - def label_fontsize(self, value: INT): - if value <= 0: - raise ValueError( - "Invalid value for the label_fontsize parameter. " - f"Value should be >=0, got {value}." - ) - - self._aesthetics.label_fontsize = value - - @property - def tick_fontsize(self) -> INT: - """Fontsize for the ticks along the plot's axes.""" - return self._aesthetics.tick_fontsize - - @tick_fontsize.setter - def tick_fontsize(self, value: INT): - if value <= 0: - raise ValueError( - "Invalid value for the tick_fontsize parameter. " - f"Value should be >=0, got {value}." - ) - - self._aesthetics.tick_fontsize = value - - @property - def line_width(self) -> INT: - """Width of the line plots.""" - return self._aesthetics.line_width - - @line_width.setter - def line_width(self, value: INT): - if value <= 0: - raise ValueError( - "Invalid value for the line_width parameter. " - f"Value should be >=0, got {value}." - ) - - self._aesthetics.line_width = value - - @property - def marker_size(self) -> INT: - """Size of the markers.""" - return self._aesthetics.marker_size - - @marker_size.setter - def marker_size(self, value: INT): - if value <= 0: - raise ValueError( - "Invalid value for the marker_size parameter. " - f"Value should be >=0, got {value}." 
- ) - - self._aesthetics.marker_size = value - - # Methods ====================================================== >> - - @staticmethod - def _get_plot_index(df: DATAFRAME) -> INDEX: - """Return the dataset's index in a plottable format. - - Plotly does not accept all index formats (e.g. pd.Period), - thus use this utility method to convert to timestamp those - indices that can, else return as is. - - Parameters - ---------- - df: dataframe - Data set to get the index from. - - Returns - ------- - index - Index in an acceptable format. - - """ - if hasattr(df.index, "to_timestamp"): - return df.index.to_timestamp() - else: - return df.index - - @staticmethod - def _get_show(show: INT | None, model: MODEL | list[MODEL]) -> INT: - """Check and return the number of features to show. - - Parameters - ---------- - show: int or None - Number of features to show. If None, select all (max 200). - - model: Model or list - Models from which to get the features. - - Returns - ------- - int - Number of features to show. - - """ - max_fxs = max(m.n_features for m in lst(model)) - if show is None or show > max_fxs: - # Limit max features shown to avoid maximum figsize error - show = min(200, max_fxs) - elif show < 1: - raise ValueError( - f"Invalid value for the show parameter. Value should be >0, got {show}." - ) - - return show - - @staticmethod - def _get_hyperparams( - params: str | slice | SEQUENCE | None, - model: MODEL, - ) -> list[str]: - """Check and return a model's hyperparameters. - - Parameters - ---------- - params: str, slice, sequence or None - Hyperparameters to get. Use a sequence or add `+` between - options to select more than one. If None, all the model's - hyperparameters are selcted. - - model: Model - Get the params from this model. - - Returns - ------- - list of str - Selected hyperparameters. 
- - """ - if params is None: - hyperparameters = list(model._ht["distributions"]) - elif isinstance(params, slice): - hyperparameters = list(model._ht["distributions"])[params] - else: - hyperparameters = [] - for param in lst(params): - if isinstance(param, INT_TYPES): - hyperparameters.append(list(model._ht["distributions"])[param]) - elif isinstance(param, str): - for p in param.split("+"): - if p not in model._ht["distributions"]: - raise ValueError( - "Invalid value for the params parameter. " - f"Hyperparameter {p} was not used during the " - f"optimization of model {model.name}." - ) - else: - hyperparameters.append(p) - - if not hyperparameters: - raise ValueError(f"Didn't find any hyperparameters for model {model.name}.") - - return hyperparameters - - def _get_metric( - self, - metric: INT | str | SEQUENCE, - max_one: BOOL, - ) -> INT | str | list[INT]: - """Check and return the provided metric index. - - Parameters - ---------- - metric: int, str, sequence or None - Metric to retrieve. If None, all metrics are returned. - - max_one: bool - Whether one or multiple metrics are allowed. - - Returns - ------- - int or list - Position index of the metric. If `max_one=False`, returns - a list of metric positions. - - """ - if metric is None: - return list(range(len(self._metric))) - else: - inc = [] - for met in lst(metric): - if isinstance(met, INT_TYPES): - if 0 <= met < len(self._metric): - inc.append(met) - else: - raise ValueError( - f"Invalid value for the metric parameter. Value {met} is out " - f"of range for a pipeline with {len(self._metric)} metrics." - ) - elif isinstance(met, str): - met = met.lower() - for m in met.split("+"): - if m in ("time_ht", "time_fit", "time_bootstrap", "time"): - inc.append(m) - elif (name := get_custom_scorer(m).name) in self.metric: - inc.append(self._metric.index(name)) - else: - raise ValueError( - "Invalid value for the metric parameter. The " - f"{name} metric wasn't used to fit the models." 
- ) - - if len(inc) > 1 and max_one: - raise ValueError( - "Invalid value for the metric parameter. " - f"Only one metric is allowed, got {inc}." - ) - - return inc[0] if max_one else inc - - def _get_set( - self, - dataset: str | SEQUENCE, - max_one: BOOL, - allow_holdout: BOOL = True, - ) -> str | list[str]: - """Check and return the provided data set. - - Parameters - ---------- - dataset: str or sequence - Name(s) of the data set to retrieve. - - max_one: bool - Whether one or multiple data sets are allowed. If True, return - the data set instead of a list. - - allow_holdout: bool, default=True - Whether to allow the retrieval of the holdout set. - - Returns - ------- - str or list - Selected data set(s). - - """ - for ds in (dataset := "+".join(lst(dataset)).lower().split("+")): - if ds == "holdout": - if allow_holdout: - if self.holdout is None: - raise ValueError( - "Invalid value for the dataset parameter. No holdout " - "data set was specified when initializing the instance." - ) - else: - raise ValueError( - "Invalid value for the dataset parameter, got " - f"{ds}. Choose from: train, test." - ) - elif ds not in ("train", "test"): - raise ValueError( - "Invalid value for the dataset parameter, got {ds}. " - f"Choose from: train, test{', holdout' if allow_holdout else ''}." - ) - - if max_one and len(dataset) > 1: - raise ValueError( - "Invalid value for the dataset parameter, got " - f"{dataset}. Only one data set is allowed." - ) - - return dataset[0] if max_one else dataset - - def _get_figure(self, **kwargs) -> go.Figure | plt.Figure: - """Return existing figure if in canvas, else a new figure. - - Every time this method is called from a canvas, the plot - index is raised by one to keep track in which subplot the - BaseFigure is at. - - Parameters - ---------- - **kwargs - Additional keyword arguments for BaseFigure. - - Returns - ------- - [go.Figure][] or [plt.Figure][] - Existing figure or newly created. 
@staticmethod
def _draw_straight_line(y: SCALAR | str, xaxis: str, yaxis: str):
    """Add a dashed reference line to the current figure.

    The line is either horizontal or diagonal and always spans the
    full width of the x-axis. It's meant as visual reference only:
    it's added as a shape below the plot, so it's not part of the
    legend and doesn't show any information on hover.

    Parameters
    ----------
    y: int, float or str
        Coordinates on the y-axis. If a value, draw a horizontal
        line at that value. If "diagonal", draw a diagonal line
        that runs from 0 to 1 over the domain of both axes.

    xaxis: str
        Name of the x-axis to draw in.

    yaxis: str
        Name of the y-axis to draw in.

    """
    if y == "diagonal":
        # Corner to corner, expressed relative to the y-axis' domain
        start, end, yref = 0, 1, f"{yaxis} domain"
    else:
        # Flat line at the requested value, in the y-axis' own coordinates
        start, end, yref = y, y, yaxis

    line_style = dict(width=1, color="black", dash="dash")

    BasePlot._fig.figure.add_shape(
        type="line",
        x0=0,
        x1=1,
        y0=start,
        y1=end,
        xref=f"{xaxis} domain",
        yref=yref,
        line=line_style,
        opacity=0.6,
        layer="below",
    )
- - """ - # Set name with which to save the file - if kwargs.get("filename"): - if kwargs["filename"].endswith("auto"): - name = kwargs["filename"].replace("auto", kwargs["plotname"]) - else: - name = kwargs["filename"] - else: - name = kwargs.get("plotname") - - fig = fig or BasePlot._fig.figure - if BasePlot._fig.backend == "plotly": - if ax: - fig.update_layout( - { - f"{ax[0]}_title": dict( - text=kwargs.get("xlabel"), font_size=self.label_fontsize - ), - f"{ax[1]}_title": dict( - text=kwargs.get("ylabel"), font_size=self.label_fontsize - ), - f"{ax[0]}_range": kwargs.get("xlim"), - f"{ax[1]}_range": kwargs.get("ylim"), - f"{ax[0]}_automargin": True, - f"{ax[1]}_automargin": True, - } - ) - - if BasePlot._fig.is_canvas and (title := kwargs.get("title")): - # Add a subtitle to a plot in the canvas - default_title = { - "x": BasePlot._fig.pos[ax[0][5:] or "1"][0], - "y": BasePlot._fig.pos[ax[0][5:] or "1"][1] + 0.005, - "xref": "paper", - "yref": "paper", - "xanchor": "center", - "yanchor": "bottom", - "showarrow": False, - "font_size": self.title_fontsize - 4, - } - - if isinstance(title, dict): - title = {**default_title, **title} - else: - title = {"text": title, **default_title} - - fig.update_layout(dict(annotations=fig.layout.annotations + (title,))) - - if not BasePlot._fig.is_canvas and kwargs.get("plotname"): - default_title = dict( - x=0.5, - y=1, - pad=dict(t=15, b=15), - xanchor="center", - yanchor="top", - xref="paper", - font_size=self.title_fontsize, - ) - if isinstance(title := kwargs.get("title"), dict): - title = {**default_title, **title} - else: - title = {"text": title, **default_title} - - default_legend = dict( - traceorder="grouped", - groupclick=kwargs.get("groupclick", "toggleitem"), - font_size=self.label_fontsize, - bgcolor="rgba(255, 255, 255, 0.5)", - ) - if isinstance(legend := kwargs.get("legend"), str): - position = {} - legend = legend.lower() - if legend == "upper left": - position = dict(x=0.01, y=0.99, xanchor="left", 
yanchor="top") - elif legend == "lower left": - position = dict(x=0.01, y=0.01, xanchor="left", yanchor="bottom") - elif legend == "upper right": - position = dict(x=0.99, y=0.99, xanchor="right", yanchor="top") - elif legend == "lower right": - position = dict(x=0.99, y=0.01, xanchor="right", yanchor="bottom") - elif legend == "upper center": - position = dict(x=0.5, y=0.99, xanchor="center", yanchor="top") - elif legend == "lower center": - position = dict(x=0.5, y=0.01, xanchor="center", yanchor="bottom") - elif legend == "center left": - position = dict(x=0.01, y=0.5, xanchor="left", yanchor="middle") - elif legend == "center right": - position = dict(x=0.99, y=0.5, xanchor="right", yanchor="middle") - elif legend == "center": - position = dict(x=0.5, y=0.5, xanchor="center", yanchor="middle") - elif legend != "out": - raise ValueError( - "Invalid value for the legend parameter. Got unknown " - f"position: {legend}. Choose from: upper left, upper " - "right, lower left, lower right, upper center, lower " - "center, center left, center right, center, out." - ) - legend = {**default_legend, **position} - elif isinstance(legend, dict): - legend = {**default_legend, **legend} - - # Update layout with predefined settings - space1 = self.title_fontsize if title.get("text") else 10 - space2 = self.title_fontsize * int(bool(fig.layout.annotations)) - fig.update_layout( - title=title, - legend=legend, - showlegend=bool(kwargs.get("legend")), - hoverlabel=dict(font_size=self.label_fontsize), - font_size=self.tick_fontsize, - margin=dict(l=50, b=50, r=0, t=25 + space1 + space2, pad=0), - width=kwargs["figsize"][0], - height=kwargs["figsize"][1], - ) - - # Update plot with custom settings - fig.update_traces(**self._custom_traces) - fig.update_layout(**self._custom_layout) - - if kwargs.get("filename"): - if "." not in name or name.endswith(".html"): - fig.write_html(name if "." 
@composed(contextmanager, crash)
def canvas(
    self,
    rows: INT = 1,
    cols: INT = 2,
    *,
    horizontal_spacing: FLOAT = 0.05,
    vertical_spacing: FLOAT = 0.07,
    title: str | dict | None = None,
    legend: str | dict | None = "out",
    figsize: tuple[INT, INT] | None = None,
    filename: str | None = None,
    display: BOOL = True,
):
    """Create a figure with multiple plots.

    This `@contextmanager` allows you to draw many plots in one
    figure. The default option is to add two plots side by side.
    See the [user guide][canvas] for an example.

    The figure is finalized (layout applied, saved and/or displayed)
    when the `with` block is exited. Since this happens in a
    `finally` clause, it also runs when the block raised an
    exception.

    Parameters
    ----------
    rows: int, default=1
        Number of plots in length.

    cols: int, default=2
        Number of plots in width.

    horizontal_spacing: float, default=0.05
        Horizontal space between subplots (i.e. between columns) in
        normalized plot coordinates. The spacing is relative to the
        figure's size.

    vertical_spacing: float, default=0.07
        Vertical space between subplots (i.e. between rows) in
        normalized plot coordinates. The spacing is relative to the
        figure's size.

    title: str, dict or None, default=None
        Title for the plot.

        - If None, no title is shown.
        - If str, text for the title.
        - If dict, [title configuration][parameters].

    legend: str, dict or None, default="out"
        Legend for the plot. See the [user guide][parameters] for
        an extended description of the choices.

        - If None: No legend is shown.
        - If str: Location where to show the legend.
        - If dict: Legend configuration.

    figsize: tuple or None, default=None
        Figure's size in pixels, format as (x, y). If None, it
        adapts the size to the number of plots in the canvas,
        using (550 + 350 * cols, 200 + 400 * rows).

    filename: str or None, default=None
        Save the plot using this name. Use "auto" for automatic
        naming. The type of the file depends on the provided name
        (.html, .png, .pdf, etc...). If `filename` has no file type,
        the plot is saved as html. If None, the plot is not saved.

    display: bool, default=True
        Whether to render the plot.

    Yields
    ------
    [go.Figure][]
        Plot object.

    """
    # Replace the shared figure with one flagged as canvas, so plots
    # made inside the with-block are added to it as subplots
    BasePlot._fig = BaseFigure(
        rows=rows,
        cols=cols,
        horizontal_spacing=horizontal_spacing,
        vertical_spacing=vertical_spacing,
        palette=self.palette,
        is_canvas=True,
    )

    try:
        yield BasePlot._fig.figure
    finally:
        # The flag must be reset before calling _plot, which only
        # applies the figure-level layout, saving and rendering
        # when the figure is not flagged as a canvas
        BasePlot._fig.is_canvas = False  # Close the canvas
        self._plot(
            groupclick="togglegroup",
            title=title,
            legend=legend,
            figsize=figsize or (550 + 350 * cols, 200 + 400 * rows),
            plotname="canvas",
            filename=filename,
            display=display,
        )
- - """ - - @available_if(has_attr("pca")) - @crash - def plot_components( - self, - show: INT | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the explained variance ratio per component. - - Kept components are colored and discarted components are - transparent. This plot is available only when feature selection - was applied with strategy="pca". - - Parameters - ---------- - show: int or None, default=None - Number of components to show. None to show all. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of components shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:FeatureSelectorPlot.plot_pca - atom.plots:FeatureSelectorPlot.plot_rfecv - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.feature_selection("pca", n_features=5) - atom.plot_components(show=10) - ``` - - """ - if show is None or show > self.pca.components_.shape[0]: - # Limit max features shown to avoid maximum figsize error - show = min(200, self.pca.components_.shape[0]) - elif show < 1: - raise ValueError( - "Invalid value for the show parameter. " - f"Value should be >0, got {show}." - ) - - # Get the variance ratio per component - variance = np.array(self.pca.explained_variance_ratio_) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - # Create color scheme: first normal and then fully transparent - color = BasePlot._fig.get_elem("components") - opacity = [0.2] * self.pca._comps + [0] * (len(variance) - self.pca._comps) - - fig.add_trace( - go.Bar( - x=variance, - y=[f"pca{str(i)}" for i in range(len(variance))], - orientation="h", - marker=dict( - color=[f"rgba({color[4:-1]}, {o})" for o in opacity], - line=dict(width=2, color=color), - ), - hovertemplate="%{x}", - name=f"Variance retained: {variance[:self.pca._comps].sum():.3f}", - legendgroup="components", - showlegend=BasePlot._fig.showlegend("components", legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout({f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending")}) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Explained variance ratio", - ylim=(len(variance) - show - 0.5, len(variance) - 0.5), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_components", - filename=filename, - display=display, - ) - - @available_if(has_attr("pca")) - @crash - def plot_pca( - self, 
- *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the explained variance ratio vs number of components. - - If the underlying estimator is [PCA][] (for dense datasets), - all possible components are plotted. If the underlying estimator - is [TruncatedSVD][] (for sparse datasets), it only shows the - selected components. The star marks the number of components - selected by the user. This plot is available only when feature - selection was applied with strategy="pca". - - Parameters - ---------- - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:FeatureSelectorPlot.plot_components - atom.plots:FeatureSelectorPlot.plot_rfecv - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.feature_selection("pca", n_features=5) - atom.plot_pca() - ``` - - """ - # Create star symbol at selected number of components - symbols = ["circle"] * self.pca.n_features_in_ - symbols[self.pca._comps - 1] = "star" - sizes = [self.marker_size] * self.pca.n_features_in_ - sizes[self.pca._comps - 1] = self.marker_size * 1.5 - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - fig.add_trace( - go.Scatter( - x=tuple(range(1, self.pca.n_features_in_ + 1)), - y=np.cumsum(self.pca.explained_variance_ratio_), - mode="lines+markers", - line=dict(width=self.line_width, color=BasePlot._fig.get_elem("pca")), - marker=dict( - symbol=symbols, - size=sizes, - line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), - opacity=1, - ), - hovertemplate="%{y}", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - "hovermode": "x", - f"xaxis{xaxis[1:]}_showspikes": True, - f"yaxis{yaxis[1:]}_showspikes": True, - } - ) - - margin = self.pca.n_features_in_ / 30 - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="First N principal components", - ylabel="Cumulative variance ratio", - xlim=(1 - margin, self.pca.n_features_in_ - 1 + margin), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_pca", - filename=filename, - display=display, - ) - - @available_if(has_attr("rfecv")) - @crash - def plot_rfecv( - self, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the rfecv results. 
- - Plot the scores obtained by the estimator fitted on every - subset of the dataset. Only available when feature selection - was applied with strategy="rfecv". - - Parameters - ---------- - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:FeatureSelectorPlot.plot_components - atom.plots:FeatureSelectorPlot.plot_pca - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.feature_selection("rfecv", solver="Tree") - atom.plot_rfecv() - ``` - - """ - try: # Define the y-label for the plot - ylabel = self.rfecv.get_params()["scoring"].name - except AttributeError: - ylabel = "accuracy" if self.goal.startswith("class") else "r2" - - x = range(self.rfecv.min_features_to_select, self.rfecv.n_features_in_ + 1) - - # Create star symbol at selected number of features - sizes = [6] * len(x) - sizes[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = 12 - symbols = ["circle"] * len(x) - symbols[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = "star" - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - mean = self.rfecv.cv_results_["mean_test_score"] - std = self.rfecv.cv_results_["std_test_score"] - - fig.add_trace( - go.Scatter( - x=list(x), - y=mean, - mode="lines+markers", - line=dict(width=self.line_width, color=BasePlot._fig.get_elem("rfecv")), - marker=dict( - symbol=symbols, - size=sizes, - line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), - opacity=1, - ), - name=ylabel, - legendgroup="rfecv", - showlegend=BasePlot._fig.showlegend("rfecv", legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - # Add error bands - fig.add_traces( - [ - go.Scatter( - x=tuple(x), - y=mean + std, - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")), - hovertemplate="%{y}upper bound", - legendgroup="rfecv", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - go.Scatter( - x=tuple(x), - y=mean - std, - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")), - fill="tonexty", - 
fillcolor=f"rgba{BasePlot._fig.get_elem('rfecv')[3:-1]}, 0.2)", - hovertemplate="%{y}lower bound", - legendgroup="rfecv", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - ] - ) - - fig.update_layout({"hovermode": "x unified"}) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - xlabel="Number of features", - ylabel=ylabel, - xlim=(min(x) - len(x) / 30, max(x) + len(x) / 30), - ylim=(min(mean) - 3 * max(std), max(mean) + 3 * max(std)), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_rfecv", - filename=filename, - display=display, - ) - - -@typechecked -class DataPlot(BasePlot): - """Data plots. - - Plots used for understanding and interpretation of the dataset. - They are only accessible from atom since. The other runners should - be used for model training only, not for data manipulation. - - """ - - @crash - def plot_correlation( - self, - columns: slice | SEQUENCE | None = None, - method: str = "pearson", - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (800, 700), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a correlation matrix. - - Displays a heatmap showing the correlation between columns in - the dataset. The colors red, blue and white stand for positive, - negative, and no correlation respectively. - - Parameters - ---------- - columns: slice, sequence or None, default=None - Columns to plot. If None, plot all columns in the dataset. - Selected categorical columns are ignored. - - method: str, default="pearson" - Method of correlation. Choose from: pearson, kendall or - spearman. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. 
- - figsize: tuple, default=(800, 700) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:DataPlot.plot_distribution - atom.plots:DataPlot.plot_qq - atom.plots:DataPlot.plot_relationships - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.plot_correlation() - ``` - - """ - columns = self.branch._get_columns(columns, only_numerical=True) - if method.lower() not in ("pearson", "kendall", "spearman"): - raise ValueError( - f"Invalid value for the method parameter, got {method}. " - "Choose from: pearson, kendall or spearman." - ) - - # Compute the correlation matrix - corr = self.dataset[columns].corr(method=method.lower()) - - # Generate a mask for the lower triangle - # k=1 means keep outermost diagonal line - mask = np.zeros_like(corr, dtype=bool) - mask[np.triu_indices_from(mask, k=1)] = True - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes( - x=(0, 0.87), - coloraxis=dict( - colorscale="rdbu_r", - cmin=-1, - cmax=1, - title=f"{method.lower()} correlation", - font_size=self.label_fontsize, - ), - ) - - fig.add_trace( - go.Heatmap( - z=corr.mask(mask), - x=columns, - y=columns, - coloraxis=f"coloraxis{xaxis[1:]}", - hovertemplate="x:%{x}
y:%{y}
z:%{z}", - hoverongaps=False, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - "template": "plotly_white", - f"yaxis{yaxis[1:]}_autorange": "reversed", - f"xaxis{xaxis[1:]}_showgrid": False, - f"yaxis{yaxis[1:]}_showgrid": False, - } - ) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_correlation", - filename=filename, - display=display, - ) - - @crash - def plot_distribution( - self, - columns: SLICE = 0, - distributions: str | SEQUENCE | None = None, - show: INT | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot column distributions. - - - For numerical columns, plot the probability density - distribution. Additionally, it's possible to plot any of - `scipy.stats` distributions fitted to the column. - - For categorical columns, plot the class distribution. - Only one categorical column can be plotted at the same time. - - !!! tip - Use atom's [distribution][atomclassifier-distribution] - method to check which distribution fits the column best. - - Parameters - ---------- - columns: int, str, slice or sequence, default=0 - Columns to plot. I's only possible to plot one categorical - column. If more than one categorical columns are selected, - all categorical columns are ignored. - - distributions: str, sequence or None, default=None - Names of the `scipy.stats` distributions to fit to the - columns. If None, a [Gaussian kde distribution][kde] is - showed. Only for numerical columns. - - show: int or None, default=None - Number of classes (ordered by number of occurrences) to - show in the plot. If None, it shows all classes. Only for - categorical columns. - - title: str, dict or None, default=None - Title for the plot. 
- - - If None: No title is shown. - - If str: Text for the title. - - If dict: [title configuration][parameters]. - - legend: str, dict or None, default="upper right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the plot's type. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:DataPlot.plot_correlation - atom.plots:DataPlot.plot_qq - atom.plots:DataPlot.plot_relationships - - Examples - -------- - ```pycon - import numpy as np - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - # Add a categorical feature - animals = ["cat", "dog", "bird", "lion", "zebra"] - probabilities = [0.001, 0.1, 0.2, 0.3, 0.399] - X["animals"] = np.random.choice(animals, size=len(X), p=probabilities) - - atom = ATOMClassifier(X, y, random_state=1) - atom.plot_distribution(columns=[0, 1]) - atom.plot_distribution(columns=0, distributions=["norm", "invgauss"]) - atom.plot_distribution(columns="animals") - ``` - - """ - columns = self.branch._get_columns(columns) - cat_columns = list(self.dataset.select_dtypes(exclude="number").columns) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - if len(columns) == 1 and columns[0] in 
cat_columns: - series = self.dataset[columns[0]].value_counts(ascending=True) - - if show is None or show > len(series): - show = len(series) - elif show < 1: - raise ValueError( - "Invalid value for the show parameter." - f"Value should be >0, got {show}." - ) - - color = BasePlot._fig.get_elem() - fig.add_trace( - go.Bar( - x=series, - y=series.index, - orientation="h", - marker=dict( - color=f"rgba({color[4:-1]}, 0.2)", - line=dict(width=2, color=color), - ), - hovertemplate="%{x}", - name=f"{columns[0]}: {len(series)} classes", - showlegend=BasePlot._fig.showlegend("dist", legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Counts", - ylim=(len(series) - show - 0.5, len(series) - 0.5), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_distribution", - filename=filename, - display=display, - ) - - else: - for col in [c for c in columns if c not in cat_columns]: - fig.add_trace( - go.Histogram( - x=self.dataset[col], - histnorm="probability density", - marker=dict( - color=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)", - line=dict(width=2, color=BasePlot._fig.get_elem(col)), - ), - nbinsx=40, - name="dist", - legendgroup=col, - legendgrouptitle=dict(text=col, font_size=self.label_fontsize), - showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - x = np.linspace(self.dataset[col].min(), self.dataset[col].max(), 200) - - # Drop missing values for compatibility with scipy.stats - missing = self.missing + [np.inf, -np.inf] - values = self.dataset[col].replace(missing, np.NaN).dropna() - - if distributions: - # Get a line for each distribution - for j, dist in enumerate(lst(distributions)): - params = getattr(stats, dist).fit(values) - - fig.add_trace( - self._draw_line( - x=x, - y=getattr(stats, dist).pdf(x, *params), - parent=col, - child=dist, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - 
) - else: - # If no distributions specified, draw Gaussian kde - fig.add_trace( - self._draw_line( - x=x, - y=stats.gaussian_kde(values)(x), - parent=col, - child="kde", - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout(dict(barmode="overlay")) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Values", - ylabel="Probability density", - title=title, - legend=legend, - figsize=figsize or (900, 600), - plotname="plot_distribution", - filename=filename, - display=display, - ) - - @crash - def plot_ngrams( - self, - ngram: INT | str = "bigram", - index: SLICE | None = None, - show: INT = 10, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot n-gram frequencies. - - The text for the plot is extracted from the column named - `corpus`. If there is no column with that name, an exception - is raised. If the documents are not tokenized, the words are - separated by spaces. - - !!! tip - Use atom's [tokenize][atomclassifier-tokenize] method to - separate the words creating n-grams based on their frequency - in the corpus. - - Parameters - ---------- - ngram: str or int, default="bigram" - Number of contiguous words to search for (size of n-gram). - Choose from: words (1), bigrams (2), trigrams (3), - quadgrams (4). - - index: int, str, slice, sequence or None, default=None - Documents in the corpus to include in the search. If None, - it selects all documents in the dataset. - - show: int, default=10 - Number of n-grams (ordered by number of occurrences) to - show in the plot. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. 
See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of n-grams shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:DataPlot.plot_wordcloud - - Examples - -------- - ```pycon - import numpy as np - from atom import ATOMClassifier - from sklearn.datasets import fetch_20newsgroups - - X, y = fetch_20newsgroups( - return_X_y=True, - categories=["alt.atheism", "sci.med", "comp.windows.x"], - shuffle=True, - random_state=1, - ) - X = np.array(X).reshape(-1, 1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.textclean() - atom.textnormalize() - atom.plot_ngrams() - ``` - - """ - - def get_text(column: SERIES) -> SERIES: - """Get the complete corpus as sequence of tokens. - - Parameters - ---------- - column: series - Column containing the corpus. - - Returns - ------- - series - Corpus of tokens. 
- - """ - if isinstance(column.iat[0], str): - return column.apply(lambda row: row.split()) - else: - return column - - corpus = get_corpus(self.X) - rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)] - - if str(ngram).lower() in ("1", "word", "words"): - ngram = "words" - series = pd.Series( - [word for row in get_text(rows[corpus]) for word in row] - ).value_counts(ascending=True) - else: - if str(ngram).lower() in ("2", "bigram", "bigrams"): - ngram, finder = "bigrams", BigramCollocationFinder - elif str(ngram).lower() in ("3", "trigram", "trigrams"): - ngram, finder = "trigrams", TrigramCollocationFinder - elif str(ngram).lower() in ("4", "quadgram", "quadgrams"): - ngram, finder = "quadgrams", QuadgramCollocationFinder - else: - raise ValueError( - f"Invalid value for the ngram parameter, got {ngram}. " - "Choose from: words, bigram, trigram, quadgram." - ) - - ngram_fd = finder.from_documents(get_text(rows[corpus])).ngram_fd - series = pd.Series( - data=[x[1] for x in ngram_fd.items()], - index=[" ".join(x[0]) for x in ngram_fd.items()], - ).sort_values(ascending=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - fig.add_trace( - go.Bar( - x=(data := series[-show:]), - y=data.index, - orientation="h", - marker=dict( - color=f"rgba({BasePlot._fig.get_elem(ngram)[4:-1]}, 0.2)", - line=dict(width=2, color=BasePlot._fig.get_elem(ngram)), - ), - hovertemplate="%{x}", - name=f"Total {ngram}: {len(series)}", - legendgroup=ngram, - showlegend=BasePlot._fig.showlegend(ngram, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Counts", - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_ngrams", - filename=filename, - display=display, - ) - - @crash - def plot_qq( - self, - columns: SLICE = 0, - distributions: str | SEQUENCE = "norm", - *, - title: str | dict | None = None, - legend: str | dict | None 
= "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a quantile-quantile plot. - - Columns are distinguished by color and the distributions are - distinguished by marker type. Missing values are ignored. - - Parameters - ---------- - columns: int, str, slice or sequence, default=0 - Columns to plot. Selected categorical columns are ignored. - - distributions: str or sequence, default="norm" - Names of the `scipy.stats` distributions to fit to the - columns. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:DataPlot.plot_correlation - atom.plots:DataPlot.plot_distribution - atom.plots:DataPlot.plot_relationships - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.plot_qq(columns=[5, 6]) - atom.plot_qq(columns=0, distributions=["norm", "invgauss", "triang"]) - ``` - - """ - columns = self.branch._get_columns(columns) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - percentiles = np.linspace(0, 100, 101) - for col in columns: - # Drop missing values for compatibility with scipy.stats - missing = self.missing + [np.inf, -np.inf] - values = self.dataset[col].replace(missing, np.NaN).dropna() - - for dist in lst(distributions): - stat = getattr(stats, dist) - params = stat.fit(values) - samples = stat.rvs(*params, size=101, random_state=self.random_state) - - fig.add_trace( - self._draw_line( - x=np.percentile(samples, percentiles), - y=np.percentile(values, percentiles), - mode="markers", - parent=col, - child=dist, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Theoretical quantiles", - ylabel="Observed quantiles", - title=title, - legend=legend, - figsize=figsize or (900, 600), - plotname="plot_qq", - filename=filename, - display=display, - ) - - @crash - def plot_relationships( - self, - columns: slice | SEQUENCE = (0, 1, 2), - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (900, 900), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot pairwise relationships in a dataset. 
- - Creates a grid of axes such that each numerical column appears - once on the x-axes and once on the y-axes. The bottom triangle - contains scatter plots (max 250 random samples), the diagonal - plots contain column distributions, and the upper triangle - contains contour histograms for all samples in the columns. - - Parameters - ---------- - columns: slice or sequence, default=(0, 1, 2) - Columns to plot. Selected categorical columns are ignored. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple, default=(900, 900) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:DataPlot.plot_correlation - atom.plots:DataPlot.plot_distribution - atom.plots:DataPlot.plot_qq - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.plot_relationships(columns=[0, 4, 5]) - ``` - - """ - columns = self.branch._get_columns(columns, only_numerical=True) - - # Use max 250 samples to not clutter the plot - sample = lambda col: self.dataset[col].sample( - n=min(len(self.dataset), 250), random_state=self.random_state - ) - - fig = self._get_figure() - color = BasePlot._fig.get_elem() - for i in range(len(columns)**2): - x, y = i // len(columns), i % len(columns) - - # Calculate the distance between subplots - offset = divide(0.0125, (len(columns) - 1)) - - # Calculate the size of the subplot - size = (1 - ((offset * 2) * (len(columns) - 1))) / len(columns) - - # Determine the position for the axes - x_pos = y * (size + 2 * offset) - y_pos = (len(columns) - x - 1) * (size + 2 * offset) - - xaxis, yaxis = BasePlot._fig.get_axes( - x=(x_pos, rnd(x_pos + size)), - y=(y_pos, rnd(y_pos + size)), - coloraxis=dict( - colorscale=PALETTE.get(color, "Blues"), - cmin=0, - cmax=len(self.dataset), - showscale=False, - ) - ) - - if x == y: - fig.add_trace( - go.Histogram( - x=self.dataset[columns[x]], - marker=dict( - color=f"rgba({color[4:-1]}, 0.2)", - line=dict(width=2, color=color), - ), - name=columns[x], - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - elif x > y: - fig.add_trace( - go.Scatter( - x=sample(columns[y]), - y=sample(columns[x]), - mode="markers", - marker=dict(color=color), - hovertemplate="(%{x}, %{y})", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - elif y > x: - fig.add_trace( - go.Histogram2dContour( - x=self.dataset[columns[y]], - y=self.dataset[columns[x]], - coloraxis=f"coloraxis{xaxis[1:]}", - 
hovertemplate="x:%{x}
y:%{y}
z:%{z}", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - if x < len(columns) - 1: - fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) - if y > 0: - fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) - - self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel=columns[y] if x == len(columns) - 1 else None, - ylabel=columns[x] if y == 0 else None, - ) - - return self._plot( - title=title, - legend=legend, - figsize=figsize or (900, 900), - plotname="plot_relationships", - filename=filename, - display=display, - ) - - @crash - def plot_wordcloud( - self, - index: SLICE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - **kwargs, - ) -> go.Figure | None: - """Plot a wordcloud from the corpus. - - The text for the plot is extracted from the column named - `corpus`. If there is no column with that name, an exception - is raised. - - Parameters - ---------- - index: int, str, slice, sequence or None, default=None - Documents in the corpus to include in the wordcloud. If - None, it selects all documents in the dataset. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. 
- - **kwargs - Additional keyword arguments for the [Wordcloud][] object. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:DataPlot.plot_ngrams - atom.plots:PredictionPlot.plot_pipeline - - Examples - -------- - ```pycon - import numpy as np - from atom import ATOMClassifier - from sklearn.datasets import fetch_20newsgroups - - X, y = fetch_20newsgroups( - return_X_y=True, - categories=["alt.atheism", "sci.med", "comp.windows.x"], - shuffle=True, - random_state=1, - ) - X = np.array(X).reshape(-1, 1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.textclean() - atom.textnormalize() - atom.plot_wordcloud() - ``` - - """ - - def get_text(column): - """Get the complete corpus as one long string.""" - if isinstance(column.iat[0], str): - return " ".join(column) - else: - return " ".join([" ".join(row) for row in column]) - - check_dependency("wordcloud") - from wordcloud import WordCloud - - corpus = get_corpus(self.X) - rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)] - - wordcloud = WordCloud( - width=figsize[0], - height=figsize[1], - background_color=kwargs.pop("background_color", "white"), - random_state=kwargs.pop("random_state", self.random_state), - **kwargs, - ) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - fig.add_trace( - go.Image( - z=wordcloud.generate(get_text(rows[corpus])), - hoverinfo="skip", - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - f"xaxis{xaxis[1:]}_showticklabels": False, - f"yaxis{xaxis[1:]}_showticklabels": False, - } - ) - - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - title=title, - legend=legend, - figsize=figsize or (900, 600), - plotname="plot_wordcloud", - filename=filename, - display=display, - ) - - -@typechecked -class HTPlot(BasePlot): - """Hyperparameter tuning plots. - - Plots that help interpret the model's study and corresponding - trials. 
These plots are accessible from the runners or from the - models. If called from a runner, the `models` parameter has to be - specified (if None, uses all models). If called from a model, that - model is used and the `models` parameter becomes unavailable. - - """ - - @composed(crash, plot_from_model) - def plot_edf( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: INT | str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper left", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the Empirical Distribution Function of a study. - - Use this plot to analyze and improve hyperparameter search - spaces. The EDF assumes that the value of the objective - function is in accordance with the uniform distribution over - the objective space. This plot is only available for models - that ran [hyperparameter tuning][]. - - !!! note - Only complete trials are considered when plotting the EDF. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models that used hyperparameter - tuning are selected. - - metric: int, str, sequence or None, default=None - Metric to plot (only for multi-metric runs). If str, add `+` - between options to select more than one. If None, the metric - used to run the pipeline is selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). 
- - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:HTPlot.plot_hyperparameters - atom.plots:HTPlot.plot_trials - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from optuna.distributions import IntDistribution - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - - # Run three models with different search spaces - atom.run( - models="RF_1", - n_trials=10, - ht_params={"distributions": {"n_estimators": IntDistribution(6, 10)}}, - ) - atom.run( - models="RF_2", - n_trials=10, - ht_params={"distributions": {"n_estimators": IntDistribution(11, 15)}}, - ) - atom.run( - models="RF_3", - n_trials=10, - ht_params={"distributions": {"n_estimators": IntDistribution(16, 20)}}, - ) - - atom.plot_edf() - ``` - - """ - models = check_hyperparams(models, "plot_edf") - metric = self._get_metric(metric, max_one=False) - - values = [] - for m in models: - values.append([]) - for met in metric: - values[-1].append(np.array([lst(row)[met] for row in m.trials["score"]])) - - x_min = np.nanmin(np.array(values)) - x_max = np.nanmax(np.array(values)) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m, val in zip(models, values): - for met in metric: - fig.add_trace( - self._draw_line( - x=(x := np.linspace(x_min, x_max, 100)), - y=np.sum(val[met][:, np.newaxis] <= x, axis=0) / len(val[met]), - parent=m.name, - child=self._metric[met].name, - legend=legend, - 
xaxis=xaxis, - yaxis=yaxis, - ) - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - ylim=(0, 1), - xlabel="Score", - ylabel="Cumulative Probability", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_edf", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_hyperparameter_importance( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: int | str = 0, - show: INT | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a model's hyperparameter importance. - - The hyperparameter importance are calculated using the - [fANOVA][] importance evaluator. The sum of importances for all - parameters (per model) is 1. This plot is only available for - models that ran [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models that used hyperparameter - tuning are selected. - - metric: int or str, default=0 - Metric to plot (only for multi-metric runs). - - show: int or None, default=None - Number of hyperparameters (ordered by importance) to show. - None to show all. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of hyperparameters shown. 
- - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_feature_importance - atom.plots:HTPlot.plot_hyperparameters - atom.plots:HTPlot.plot_trials - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["ET", "RF"], n_trials=10) - atom.plot_hyperparameter_importance() - ``` - - """ - models = check_hyperparams(models, "plot_hyperparameter_importance") - params = len(set([k for m in lst(models) for k in m._ht["distributions"]])) - met = self._get_metric(metric, max_one=True) - - if show is None or show > params: - # Limit max features shown to avoid maximum figsize error - show = min(200, params) - elif show < 1: - raise ValueError( - f"Invalid value for the show parameter. Value should be >0, got {show}." 
- ) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - importances = FanovaImportanceEvaluator(seed=self.random_state).evaluate( - study=m.study, - target=None if len(self._metric) == 1 else lambda x: x.values[met], - ) - - fig.add_trace( - go.Bar( - x=np.array(list(importances.values())) / sum(importances.values()), - y=list(importances.keys()), - orientation="h", - marker=dict( - color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - ), - hovertemplate="%{x}", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), - "bargroupgap": 0.05, - } - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Normalized hyperparameter importance", - ylim=(params - show - 0.5, params - 0.5), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_hyperparameter_importance", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_hyperparameters( - self, - models: INT | str | MODEL | None = None, - params: str | slice | SEQUENCE = (0, 1), - metric: int | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot hyperparameter relationships in a study. - - A model's hyperparameters are plotted against each other. The - corresponding metric scores are displayed in a contour plot. - The markers are the trials in the study. This plot is only - available for models that ran [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. 
If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_hyperparameters()`. - - params: str, slice or sequence, default=(0, 1) - Hyperparameters to plot. Use a sequence or add `+` between - options to select more than one. - - metric: int or str, default=0 - Metric to plot (only for multi-metric runs). - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of hyperparameters shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:HTPlot.plot_hyperparameter_importance - atom.plots:HTPlot.plot_parallel_coordinate - atom.plots:HTPlot.plot_trials - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR", n_trials=15) - atom.plot_hyperparameters(params=(0, 1, 2)) - ``` - - """ - m = check_hyperparams(models, "plot_hyperparameters")[0] - - if len(params := self._get_hyperparams(params, models)) < 2: - raise ValueError( - "Invalid value for the hyperparameters parameter. A minimum " - f"of two parameters is required, got {len(params)}." - ) - - met = self._get_metric(metric, max_one=True) - - fig = self._get_figure() - for i in range((length := len(params) - 1) ** 2): - x, y = i // length, i % length - - if y <= x: - # Calculate the size of the subplot - size = 1 / length - - # Determine the position for the axes - x_pos = y * size - y_pos = (length - x - 1) * size - - xaxis, yaxis = BasePlot._fig.get_axes( - x=(x_pos, rnd(x_pos + size)), - y=(y_pos, rnd(y_pos + size)), - coloraxis=dict( - axes="99", - colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"), - cmin=np.nanmin( - m.trials.apply(lambda x: lst(x["score"])[met], axis=1) - ), - cmax=np.nanmax( - m.trials.apply(lambda x: lst(x["score"])[met], axis=1) - ), - showscale=False, - ) - ) - - x_values = lambda row: row["params"].get(params[y], None) - y_values = lambda row: row["params"].get(params[x + 1], None) - - fig.add_trace( - go.Scatter( - x=m.trials.apply(x_values, axis=1), - y=m.trials.apply(y_values, axis=1), - mode="markers", - marker=dict( - size=self.marker_size, - color=BasePlot._fig.get_elem(m.name), - line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), - ), - customdata=list( - zip( - m.trials.index.tolist(), - m.trials.apply(lambda x: lst(x["score"])[met], axis=1), - ) - ), - hovertemplate=( - 
f"{params[y]}:%{{x}}
" - f"{params[x + 1]}:%{{y}}
" - f"{self._metric[met].name}:%{{customdata[1]:.4f}}" - "Trial %{customdata[0]}" - ), - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.add_trace( - go.Contour( - x=m.trials.apply(x_values, axis=1), - y=m.trials.apply(y_values, axis=1), - z=m.trials.apply(lambda i: lst(i["score"])[met], axis=1), - contours=dict( - showlabels=True, - labelfont=dict(size=self.tick_fontsize, color="white") - ), - coloraxis="coloraxis99", - hoverinfo="skip", - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - if _is_log_scale(m.study.trials, params[y]): - fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"}) - if _is_log_scale(m.study.trials, params[x + 1]): - fig.update_layout({f"yaxis{xaxis[1:]}_type": "log"}) - - if x < length - 1: - fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) - if y > 0: - fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) - - fig.update_layout( - { - "template": "plotly_white", - f"xaxis{xaxis[1:]}_showgrid": False, - f"yaxis{yaxis[1:]}_showgrid": False, - f"xaxis{yaxis[1:]}_zeroline": False, - f"yaxis{yaxis[1:]}_zeroline": False, - } - ) - - self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel=params[y] if x == length - 1 else None, - ylabel=params[x + 1] if y == 0 else None, - ) - - BasePlot._fig.used_models.append(m) - return self._plot( - title=title, - legend=legend, - figsize=figsize or (800 + 100 * length, 500 + 100 * length), - plotname="plot_hyperparameters", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_parallel_coordinate( - self, - models: INT | str | MODEL | None = None, - params: str | slice | SEQUENCE | None = None, - metric: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot high-dimensional parameter relationships in a study. 
- - Every line of the plot represents one trial. This plot is only - available for models that ran [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_parallel_coordinate()`. - - params: str, slice, sequence or None, default=None - Hyperparameters to plot. Use a sequence or add `+` between - options to select more than one. If None, all the model's - hyperparameters are selected. - - metric: int or str, default=0 - Metric to plot (only for multi-metric runs). - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of hyperparameters shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:HTPlot.plot_edf - atom.plots:HTPlot.plot_hyperparameter_importance - atom.plots:HTPlot.plot_hyperparameters - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("RF", n_trials=15) - atom.plot_parallel_coordinate(params=slice(1, 5)) - ``` - - """ - - def sort_mixed_types(values: list[str]) -> list[str]: - """Sort a sequence of numbers and strings. - - Numbers are converted and take precedence over strings. - - Parameters - ---------- - values: list - Values to sort. - - Returns - ------- - list of str - Sorted values. - - """ - numbers, categorical = [], [] - for elem in values: - try: - numbers.append(it(float(elem))) - except (TypeError, ValueError): - categorical.append(str(elem)) - - return list(map(str, sorted(numbers))) + sorted(categorical) - - m = check_hyperparams(models, "plot_parallel_coordinate")[0] - params = self._get_hyperparams(params, models) - met = self._get_metric(metric, max_one=True) - - dims = _get_dims_from_info( - _get_parallel_coordinate_info( - study=m.study, - params=params, - target=None if len(self._metric) == 1 else lambda x: x.values[met], - target_name=self._metric[met].name, - ) - ) - - # Clean up dimensions for nicer view - for d in [dims[0]] + sorted(dims[1:], key=lambda x: params.index(x["label"])): - if "ticktext" in d: - # Skip processing for logarithmic params - if all(isinstance(i, INT_TYPES) for i in d["values"]): - # Order categorical values - mapping = [d["ticktext"][i] for i in d["values"]] - d["ticktext"] = sort_mixed_types(d["ticktext"]) - d["values"] = [d["ticktext"].index(v) for v in mapping] - else: - # Round numerical values - d["tickvals"] = list( - map(rnd, np.linspace(min(d["values"]), max(d["values"]), 5)) - ) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes( - 
coloraxis=dict( - colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"), - cmin=min(dims[0]["values"]), - cmax=max(dims[0]["values"]), - title=self._metric[met].name, - font_size=self.label_fontsize, - ) - ) - - fig.add_trace( - go.Parcoords( - dimensions=dims, - line=dict( - color=dims[0]["values"], - coloraxis=f"coloraxis{xaxis[1:]}", - ), - unselected=dict(line=dict(color="gray", opacity=0.5)), - labelside="bottom", - labelfont=dict(size=self.label_fontsize), - ) - ) - - BasePlot._fig.used_models.append(m) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - title=title, - legend=legend, - figsize=figsize or (700 + len(params) * 50, 600), - plotname="plot_parallel_coordinate", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_pareto_front( - self, - models: INT | str | MODEL | None = None, - metric: str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the Pareto front of a study. - - Shows the trial scores plotted against each other. The marker's - colors indicate the trial number. This plot is only available - for models that ran [multi-metric runs][] with - [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_pareto_front()`. - - metric: str, sequence or None, default=None - Metrics to plot. Use a sequence or add `+` between options - to select more than one. If None, the metrics used to run - the pipeline are selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. 
- - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of metrics shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:HTPlot.plot_edf - atom.plots:HTPlot.plot_slice - atom.plots:HTPlot.plot_trials - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run( - models="RF", - metric=["f1", "accuracy", "recall"], - n_trials=15, - ) - atom.plot_pareto_front() - ``` - - """ - m = check_hyperparams(models, "plot_pareto_front")[0] - - if len(metric := self._get_metric(metric, max_one=False)) < 2: - raise ValueError( - "Invalid value for the metric parameter. A minimum " - f"of two metrics are required, got {len(metric)}." 
- ) - - fig = self._get_figure() - for i in range((length := len(metric) - 1) ** 2): - x, y = i // length, i % length - - if y <= x: - # Calculate the distance between subplots - offset = divide(0.0125, length - 1) - - # Calculate the size of the subplot - size = (1 - ((offset * 2) * (length - 1))) / length - - # Determine the position for the axes - x_pos = y * (size + 2 * offset) - y_pos = (length - x - 1) * (size + 2 * offset) - - xaxis, yaxis = BasePlot._fig.get_axes( - x=(x_pos, rnd(x_pos + size)), - y=(y_pos, rnd(y_pos + size)), - ) - - fig.add_trace( - go.Scatter( - x=m.trials.apply(lambda row: row["score"][y], axis=1), - y=m.trials.apply(lambda row: row["score"][x + 1], axis=1), - mode="markers", - marker=dict( - size=self.marker_size, - color=m.trials.index, - colorscale="Teal", - line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), - ), - customdata=m.trials.index, - hovertemplate="(%{x}, %{y})Trial %{customdata}", - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - if x < len(metric) - 1: - fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) - if y > 0: - fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) - - self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel=self._metric[y].name if x == length - 1 else None, - ylabel=self._metric[x + 1].name if y == 0 else None, - ) - - BasePlot._fig.used_models.append(m) - return self._plot( - title=title, - legend=legend, - figsize=figsize or (500 + 100 * length, 500 + 100 * length), - plotname="plot_pareto_front", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_slice( - self, - models: INT | str | MODEL | None = None, - params: str | slice | SEQUENCE | None = None, - metric: INT | str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the 
parameter relationship in a study. - - The color of the markers indicate the trial. This plot is only - available for models that ran [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_slice()`. - - params: str, slice, sequence or None, default=None - Hyperparameters to plot. Use a sequence or add `+` between - options to select more than one. If None, all the model's - hyperparameters are selected. - - metric: int or str, default=None - Metric to plot (only for multi-metric runs). If str, add `+` - between options to select more than one. If None, the metric - used to run the pipeline is selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of hyperparameters shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:HTPlot.plot_edf - atom.plots:HTPlot.plot_hyperparameters - atom.plots:HTPlot.plot_parallel_coordinate - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run( - models="RF", - metric=["f1", "recall"], - n_trials=15, - ) - atom.plot_slice(params=(0, 1, 2)) - ``` - - """ - m = check_hyperparams(models, "plot_slice")[0] - params = self._get_hyperparams(params, models) - metric = self._get_metric(metric, max_one=False) - - fig = self._get_figure() - for i in range(len(params) * len(metric)): - x, y = i // len(params), i % len(params) - - # Calculate the distance between subplots - x_offset = divide(0.0125, (len(params) - 1)) - y_offset = divide(0.0125, (len(metric) - 1)) - - # Calculate the size of the subplot - x_size = (1 - ((x_offset * 2) * (len(params) - 1))) / len(params) - y_size = (1 - ((y_offset * 2) * (len(metric) - 1))) / len(metric) - - # Determine the position for the axes - x_pos = y * (x_size + 2 * x_offset) - y_pos = (len(metric) - x - 1) * (y_size + 2 * y_offset) - - xaxis, yaxis = BasePlot._fig.get_axes( - x=(x_pos, rnd(x_pos + x_size)), - y=(y_pos, rnd(y_pos + y_size)), - ) - - fig.add_trace( - go.Scatter( - x=m.trials.apply(lambda r: r["params"].get(params[y], None), axis=1), - y=m.trials.apply(lambda r: lst(r["score"])[x], axis=1), - mode="markers", - marker=dict( - size=self.marker_size, - color=m.trials.index, - colorscale="Teal", - line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), - ), - customdata=m.trials.index, - hovertemplate="(%{x}, %{y})Trial %{customdata}", - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - if _is_log_scale(m.study.trials, params[y]): - fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"}) - - if x < len(metric) - 1: - fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) - if y > 0: - 
fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) - - self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel=params[y] if x == len(metric) - 1 else None, - ylabel=self._metric[x].name if y == 0 else None, - ) - - BasePlot._fig.used_models.append(m) - return self._plot( - title=title, - legend=legend, - figsize=figsize or (800 + 100 * len(params), 500 + 100 * len(metric)), - plotname="plot_slice", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_terminator_improvement( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the potentials for future objective improvement. - - This function visualizes the objective improvement potentials. - It helps to determine whether you should continue the - optimization or not. The evaluated error is also plotted. Note - that this function may take some time to compute the improvement - potentials. This plot is only available for models that ran - [hyperparameter tuning][]. - - !!! warning - * The plot_terminator_improvement method is only available - for models that ran [hyperparameter tuning][] using - cross-validation, e.g. using `ht_params={'cv': 5}`. - * This method can be slow. Results are cached to fasten - repeated calls. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models that used hyperparameter - tuning are selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper right", - Legend for the plot. 
See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y) - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:HTPlot.plot_pareto_front - atom.plots:HTPlot.plot_timeline - atom.plots:HTPlot.plot_trials - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("RF", n_trials=10, ht_params={"cv": 5}) - atom.plot_terminator_improvement() - ``` - - """ - check_dependency("botorch") - - models = check_hyperparams(models, "plot_terminator_improvement") - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - if m._ht["cv"] > 1: - info = self._memory.cache(_get_improvement_info)(m.study, get_error=True) - else: - raise ValueError( - "The plot_terminator_improvement method is only available for " - "models that ran hyperparameter tuning using cross-validation, " - "e.g. using ht_params={'cv': 5}." 
- ) - - fig.add_trace( - self._draw_line( - x=m.trials.index, - y=info.improvements, - error_y=dict(type="data", array=info.errors), - mode="markers+lines", - parent=m.name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Trial", - ylabel="Terminator improvement", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_terminator_improvement", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_timeline( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the timeline of a study. - - This plot is only available for models that ran - [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models that used hyperparameter - tuning are selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right", - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y) - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. 
- - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:HTPlot.plot_edf - atom.plots:HTPlot.plot_slice - atom.plots:HTPlot.plot_terminator_improvement - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from optuna.pruners import PatientPruner - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run( - models="LGB", - n_trials=15, - ht_params={"pruner": PatientPruner(None, patience=2)}, - ) - atom.plot_timeline() - ``` - - """ - models = check_hyperparams(models, "plot_timeline") - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - _cm = { - "COMPLETE": BasePlot._fig._palette[0], # Main color - "FAIL": "rgb(255, 0, 0)", # Red - "PRUNED": "rgb(255, 165, 0)", # Orange - "RUNNING": "rgb(124, 252, 0)", # Green - "WAITING": "rgb(220, 220, 220)", # Gray - } - - for m in models: - info = [] - for trial in m.study.get_trials(deepcopy=False): - date_complete = trial.datetime_complete or datetime.now() - date_start = trial.datetime_start or date_complete - - # Create nice representation of scores and params for hover - s = [f'{m}: {trial.values[i]}' for i, m in enumerate(self._metric.keys())] - p = [f" --> {k}: {v}" for k, v in trial.params.items()] - - info.append( - Bunch( - number=trial.number, - start=date_start, - duration=1000 * (date_complete - date_start).total_seconds(), - state=trial.state, - hovertext=( - f"Trial: {trial.number}
" - f"{'
'.join(s)}" - f"Parameters:
{'
'.join(p)}" - ) - ) - ) - - for state in sorted(TrialState, key=lambda x: x.name): - if bars := list(filter(lambda x: x.state == state, info)): - fig.add_trace( - go.Bar( - name=state.name, - x=[b.duration for b in bars], - y=[b.number for b in bars], - base=[b.start.isoformat() for b in bars], - text=[b.hovertext for b in bars], - textposition="none", - hovertemplate=f"%{{text}}{m.name}", - orientation="h", - marker=dict( - color=f"rgba({_cm[state.name][4:-1]}, 0.2)", - line=dict(width=2, color=_cm[state.name]), - ), - showlegend=BasePlot._fig.showlegend(_cm[state.name], legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout({f"xaxis{yaxis[1:]}_type": "date", "barmode": "group"}) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Datetime", - ylabel="Trial", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_timeline", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_trials( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: INT | str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper left", - figsize: tuple[INT, INT] = (900, 800), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the hyperparameter tuning trials. - - Creates a figure with two plots: the first plot shows the score - of every trial and the second shows the distance between the - last consecutive steps. The best trial is indicated with a star. - This is the same plot as produced by `ht_params={"plot": True}`. - This plot is only available for models that ran - [hyperparameter tuning][]. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models that used hyperparameter - tuning are selected. 
- - metric: int, str, sequence or None, default=None - Metric to plot (only for multi-metric runs). Add `+` between - options to select more than one. If None, all metrics are - selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 800) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_evals - atom.plots:HTPlot.plot_hyperparameters - atom.plots:PredictionPlot.plot_results - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["ET", "RF"], n_trials=15) - atom.plot_trials() - ``` - - """ - models = check_hyperparams(models, "plot_trials") - metric = self._get_metric(metric, max_one=False) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0)) - xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29)) - for m in models: - for met in metric: - y = m.trials["score"].apply(lambda value: lst(value)[met]) - - # Create star symbol at best trial - symbols = ["circle"] * len(y) - symbols[m.best_trial.number] = "star" - sizes = [self.marker_size] * len(y) - sizes[m.best_trial.number] = self.marker_size * 1.5 - - fig.add_trace( - self._draw_line( - x=list(range(len(y))), - y=y, - mode="lines+markers", - marker_symbol=symbols, - marker_size=sizes, - hovertemplate=None, - parent=m.name, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis2, - yaxis=yaxis, - ) - ) - - fig.add_trace( - self._draw_line( - x=list(range(1, len(y))), - y=np.abs(np.diff(y)), - mode="lines+markers", - marker_symbol="circle", - parent=m.name, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis2, - yaxis=yaxis2, - ) - ) - - fig.update_layout( - { - f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}", - f"xaxis{xaxis[1:]}_showticklabels": False, - "hovermode": "x unified", - }, - ) - - self._plot( - ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), - xlabel="Trial", - ylabel="d", - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - ylabel="Score", - title=title, - legend=legend, - figsize=figsize, - 
plotname="plot_trials", - filename=filename, - display=display, - ) - - -@typechecked -class PredictionPlot(BasePlot): - """Prediction plots. - - Plots that use the model's predictions. These plots are accessible - from the runners or from the models. If called from a runner, the - `models` parameter has to be specified (if None, uses all models). - If called from a model, that model is used and the `models` parameter - becomes unavailable. - - """ - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_calibration( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - n_bins: INT = 10, - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper left", - figsize: tuple[INT, INT] = (900, 900), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the calibration curve for a binary classifier. - - Well calibrated classifiers are probabilistic classifiers for - which the output of the `predict_proba` method can be directly - interpreted as a confidence level. For instance a well - calibrated (binary) classifier should classify the samples such - that among the samples to which it gave a `predict_proba` value - close to 0.8, approx. 80% actually belong to the positive class. - Read more in sklearn's [documentation][calibration]. - - This figure shows two plots: the calibration curve, where the - x-axis represents the average predicted probability in each bin - and the y-axis is the fraction of positives, i.e. the proportion - of samples whose class is the positive class (in each bin); and - a distribution of all predicted probabilities of the classifier. - This plot is available only for models with a `predict_proba` - method in a binary or [multilabel][] classification task. - - !!! tip - Use the [calibrate][adaboost-calibrate] method to calibrate - the winning model. 
- - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the metric. Use a sequence - or add `+` between options to select more than one. Choose - from: "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multilabel][] tasks. - - n_bins: int, default=10 - Number of bins used for calibration. Minimum of 5 required. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 900) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_lift - atom.plots:PredictionPlot.plot_prc - atom.plots:PredictionPlot.plot_roc - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["RF", "LGB"]) - atom.plot_calibration() - ``` - - """ - check_predict_proba(models, "plot_calibration") - dataset = self._get_set(dataset, max_one=False) - target = self.branch._get_target(target, only_columns=True) - - if n_bins < 5: - raise ValueError( - "Invalid value for the n_bins parameter." - f"Value should be >=5, got {n_bins}." - ) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0)) - xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29)) - for m in models: - for ds in dataset: - y_true, y_pred = m._get_pred(ds, target, attr="predict_proba") - - # Get calibration (frac of positives and predicted values) - frac_pos, pred = calibration_curve(y_true, y_pred, n_bins=n_bins) - - fig.add_trace( - self._draw_line( - x=pred, - y=frac_pos, - parent=m.name, - child=ds, - mode="lines+markers", - marker_symbol="circle", - legend=legend, - xaxis=xaxis2, - yaxis=yaxis, - ) - ) - - fig.add_trace( - go.Histogram( - x=y_pred, - xbins=dict(start=0, end=1, size=1. 
/ n_bins), - marker=dict( - color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - ), - name=m.name, - legendgroup=m.name, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis2, - ) - ) - - self._draw_straight_line(y="diagonal", xaxis=xaxis2, yaxis=yaxis) - - fig.update_layout( - { - f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}", - f"xaxis{xaxis2[1:]}_showgrid": True, - "barmode": "overlay", - } - ) - - self._plot( - ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), - xlabel="Predicted value", - ylabel="Count", - xlim=(0, 1), - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - ylabel="Fraction of positives", - ylim=(-0.05, 1.05), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_calibration", - filename=filename, - display=display, - ) - - @available_if(has_task("class")) - @composed(crash, plot_from_model) - def plot_confusion_matrix( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str = "test", - target: INT | str = 0, - threshold: FLOAT = 0.5, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a model's confusion matrix. - - For one model, the plot shows a heatmap. For multiple models, - it compares TP, FP, FN and TN in a barplot (not implemented - for multiclass classification tasks). This plot is available - only for classification tasks. - - !!! tip - Fill the `threshold` parameter with the result from the - model's `get_best_threshold` method to optimize the results. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. 
- - dataset: str, default="test" - Data set on which to calculate the confusion matrix. Choose - from:` "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multioutput tasks][]. - - threshold: float, default=0.5 - Threshold between 0 and 1 to convert predicted probabilities - to class labels. Only for binary classification tasks. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the plot's type. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_calibration - atom.plots:PredictionPlot.plot_threshold - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, test_size=0.4) - atom.run(["LR", "RF"]) - atom.lr.plot_confusion_matrix() # For one model - atom.plot_confusion_matrix() # For multiple models - ``` - - """ - ds = self._get_set(dataset, max_one=True) - target = self.branch._get_target(target, only_columns=True) - - if self.task.startswith("multiclass") and len(models) > 1: - raise NotImplementedError( - "The plot_confusion_matrix method does not support " - "the comparison of multiple models for multiclass " - "or multiclass-multioutput classification tasks." - ) - - labels = np.array( - (("True negatives", "False positives"), ("False negatives", "True positives")) - ) - - fig = self._get_figure() - if len(models) == 1: - xaxis, yaxis = BasePlot._fig.get_axes( - x=(0, 0.87), - coloraxis=dict( - colorscale="Blues", - cmin=0, - cmax=100, - title="Percentage of samples", - font_size=self.label_fontsize, - ), - ) - else: - xaxis, yaxis = BasePlot._fig.get_axes() - - for m in models: - y_true, y_pred = m._get_pred(ds, target, attr="predict") - if threshold != 0.5: - y_pred = (y_pred > threshold).astype("int") - - cm = confusion_matrix(y_true, y_pred) - if len(models) == 1: # Create matrix heatmap - ticks = m.mapping.get(target, np.unique(m.dataset[target]).astype(str)) - xaxis, yaxis = BasePlot._fig.get_axes( - x=(0, 0.87), - coloraxis=dict( - colorscale="Blues", - cmin=0, - cmax=100, - title="Percentage of samples", - font_size=self.label_fontsize, - ), - ) - - fig.add_trace( - go.Heatmap( - x=ticks, - y=ticks, - z=100. * cm / cm.sum(axis=1)[:, np.newaxis], - coloraxis=f"coloraxis{xaxis[1:]}", - text=cm, - customdata=labels, - texttemplate="%{text}
(%{z:.2f}%)", - textfont=dict(size=self.label_fontsize), - hovertemplate=( - "%{customdata}
" if is_binary(self.task) else "" - "x:%{x}
y:%{y}
z:%{z}" - ), - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - "template": "plotly_white", - f"yaxis{yaxis[1:]}_autorange": "reversed", - f"xaxis{xaxis[1:]}_showgrid": False, - f"yaxis{yaxis[1:]}_showgrid": False, - } - ) - - else: - color = BasePlot._fig.get_elem(m.name) - fig.add_trace( - go.Bar( - x=cm.ravel(), - y=labels.ravel(), - orientation="h", - marker=dict( - color=f"rgba({color[4:-1]}, 0.2)", - line=dict(width=2, color=color), - ), - hovertemplate="%{x}", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout(bargroupgap=0.05) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Predicted label" if len(models) == 1 else "Count", - ylabel="True label" if len(models) == 1 else None, - title=title, - legend=legend, - figsize=figsize or ((800, 800) if len(models) == 1 else (900, 600)), - plotname="plot_confusion_matrix", - filename=filename, - display=display, - ) - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_det( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ): - """Plot the Detection Error Tradeoff curve. - - Read more about [DET][] in sklearn's documentation. Only - available for binary classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the metric. Use a sequence - or add `+` between options to select more than one. 
Choose - from: "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multilabel][] tasks. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_gains - atom.plots:PredictionPlot.plot_roc - atom.plots:PredictionPlot.plot_prc - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_det() - ``` - - """ - dataset = self._get_set(dataset, max_one=False) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - for ds in dataset: - # Get fpr-fnr pairs for different thresholds - fpr, fnr, _ = det_curve(*m._get_pred(ds, target, attr="thresh")) - - fig.add_trace( - self._draw_line( - x=fpr, - y=fnr, - mode="lines", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="FPR", - ylabel="FNR", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_det", - filename=filename, - display=display, - ) - - @available_if(has_task("reg")) - @composed(crash, plot_from_model) - def plot_errors( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a model's prediction errors. - - Plot the actual targets from a set against the predicted values - generated by the regressor. A linear fit is made on the data. - The gray, intersected line shows the identity line. This plot - can be useful to detect noise or heteroscedasticity along a - range of the target domain. This plot is available only for - regression tasks. 
- - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str, default="test" - Data set on which to calculate the metric. Choose from: - "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multioutput tasks][]. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_residuals - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import load_diabetes - - X, y = load_diabetes(return_X_y=True, as_frame=True) - - atom = ATOMRegressor(X, y) - atom.run(["OLS", "LGB"]) - atom.plot_errors() - ``` - - """ - ds = self._get_set(dataset, max_one=True) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - y_true, y_pred = m._get_pred(ds, target) - - fig.add_trace( - go.Scatter( - x=y_true, - y=y_pred, - mode="markers", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - # Fit the points using linear regression - from atom.models import OrdinaryLeastSquares - model = OrdinaryLeastSquares(goal=self.goal, branch=m.branch)._get_est() - model.fit(y_true.values.reshape(-1, 1), y_pred) - - fig.add_trace( - go.Scatter( - x=(x := np.linspace(y_true.min(), y_true.max(), 100)), - y=model.predict(x[:, np.newaxis]), - mode="lines", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - hovertemplate="(%{x}, %{y})", - legendgroup=m.name, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - xlabel="True value", - title=title, - legend=legend, - ylabel="Predicted value", - figsize=figsize, - plotname="plot_errors", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(ensembles=False)) - def plot_evals( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - *, - title: str | dict | None = None, - legend: str | dict | None = 
"lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot evaluation curves. - - The evaluation curves are the main metric scores achieved by the - models at every iteration of the training process. This plot is - available only for models that allow [in-training validation][]. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the evaluation curves. Use a - sequence or add `+` between options to select more than one. - Choose from: "train" or "test". - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:HTPlot.plot_trials - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["XGB", "LGB"]) - atom.plot_evals() - ``` - - """ - dataset = self._get_set(dataset, max_one=False, allow_holdout=False) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - if not m.evals: - raise ValueError( - "Invalid value for the models parameter. Model " - f"{m.name} has no in-training validation." - ) - - for ds in dataset: - fig.add_trace( - self._draw_line( - x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))), - y=m.evals[f"{self._metric[0].name}_{ds}"], - marker_symbol="circle", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Iterations", - ylabel=self._metric[0].name, - title=title, - legend=legend, - figsize=figsize, - plotname="plot_evals", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_feature_importance( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - show: INT | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a model's feature importance. - - The sum of importances for all features (per model) is 1. - This plot is available only for models whose estimator has - a `scores_`, `feature_importances_` or `coef` attribute. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. 
- - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_parshap - atom.plots:PredictionPlot.plot_partial_dependence - atom.plots:PredictionPlot.plot_permutation_importance - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_feature_importance(show=10) - ``` - - """ - show = self._get_show(show, models) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - if (fi := m.feature_importance) is None: - raise ValueError( - "Invalid value for the models parameter. 
The estimator " - f"{m.estimator.__class__.__name__} has no feature_importances_ " - "nor coef_ attribute." - ) - - fig.add_trace( - go.Bar( - x=fi, - y=fi.index, - orientation="h", - marker=dict( - color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - ), - hovertemplate="%{x}", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), - "bargroupgap": 0.05, - } - ) - - # Unique number of features over all branches - n_fxs = len(set([fx for m in models for fx in m.features])) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Normalized feature importance", - ylim=(n_fxs - show - 0.5, n_fxs - 0.5), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_feature_importance", - filename=filename, - display=display, - ) - - @available_if(has_task("forecast")) - @composed(crash, plot_from_model(check_fitted=False)) - def plot_forecast( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - fh: int | str | range | SEQUENCE | ForecastingHorizon = "test", - X: FEATURES | None = None, - target: INT | str = 0, - plot_interval: bool = True, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper left", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a time series with model forecasts. - - This plot is only available for forecasting tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. If no - models are selected, only the target column is plotted. 
- - fh: int, str, range, sequence or [ForecastingHorizon][], default="test" - Forecast horizon for which to plot the predictions. If - string, choose from: "train", "test" or "holdout". Use a - sequence or add `+` between options to select more than one. - - X: dataframe-like or None, default=None - Exogenous time series corresponding to fh. This parameter - is ignored if fh is a data set. - - target: int or str, default=0 - Target column to look at. Only for [multivariate][] tasks. - - plot_interval: bool, default=True - Whether to plot prediction intervals instead of the exact - prediction values. If True, the plotted estimators should - have a `predict_interval` method. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_lift - atom.plots:PredictionPlot.plot_prc - atom.plots:PredictionPlot.plot_roc - - Examples - -------- - ```pycon - from atom import ATOMForecaster - from sktime.datasets import load_airline - - y = load_airline() - - atom = ATOMForecaster(y, random_state=1) - atom.plot_forecast() - atom.run( - models="arima", - est_params={"order": (1, 1, 0), "seasonal_order": (0, 1, 0, 12)}, - ) - atom.plot_forecast() - atom.plot_forecast(fh="train+test", plot_interval=False) - - # Forecast the next 4 years starting from the test set - atom.plot_forecast(fh=range(1, 48)) - ``` - - """ - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - # Draw original time series - for ds in ("train", "test"): - fig.add_trace( - go.Scatter( - x=self._get_plot_index(getattr(self, ds)), - y=getattr(self, ds)[target], - mode="lines+markers", - line=dict( - width=2, - color="black", - dash=BasePlot._fig.get_elem(ds, "dash"), - ), - opacity=0.6, - name=ds, - showlegend=False if models else BasePlot._fig.showlegend(ds, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - # Draw predictions - for m in models: - if isinstance(fh, str): - # Get fh and corresponding X from data set - datasets = self._get_set(fh, max_one=False) - fh = bk.concat([getattr(m, ds) for ds in datasets]).index - X = m.X.loc[fh] - - y_pred = m.predict(fh, X) - if is_multioutput(self.task): - y_pred = y_pred[target] - - fig.add_trace( - self._draw_line( - x=self._get_plot_index(y_pred), - y=y_pred, - mode="lines+markers", - parent=m.name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - if plot_interval: - try: - y_pred = m.predict_interval(fh, X) - except NotImplementedError: - continue # Fails for some models like ES - - if is_multioutput(self.task): - # Select interval of target column for multivariate - y = y_pred.iloc[:, y_pred.columns.get_loc(target)] - else: - y = y_pred # 
Univariate - - fig.add_traces( - [ - go.Scatter( - x=self._get_plot_index(y_pred), - y=y.iloc[:, 1], - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem(m.name)), - hovertemplate=f"%{{y}}{m.name} - upper bound", - legendgroup=m.name, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - go.Scatter( - x=self._get_plot_index(y_pred), - y=y.iloc[:, 0], - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem(m.name)), - fill="tonexty", - fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)", - hovertemplate=f"%{{y}}{m.name} - lower bound", - legendgroup=m.name, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ) - ] - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup" if plot_interval else "toggleitem", - xlabel=self.y.index.name, - ylabel=target, - title=title, - legend=legend, - figsize=figsize, - plotname="plot_forecast", - filename=filename, - display=display, - ) - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_gains( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the cumulative gains curve. - - This plot is available only for binary and [multilabel][] - classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the metric. Use a sequence - or add `+` between options to select more than one. Choose - from: "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. 
Only for [multilabel][] tasks. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_det - atom.plots:PredictionPlot.plot_lift - atom.plots:PredictionPlot.plot_roc - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_gains() - ``` - - """ - dataset = self._get_set(dataset, max_one=False) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - for ds in dataset: - y_true, y_pred = m._get_pred(ds, target, attr="thresh") - - fig.add_trace( - self._draw_line( - x=np.arange(start=1, stop=len(y_true) + 1) / len(y_true), - y=np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum(), - mode="lines", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Fraction of sample", - ylabel="Gain", - xlim=(0, 1), - ylim=(0, 1.02), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_gains", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(ensembles=False)) - def plot_learning_curve( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: INT | str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the learning curve: score vs number of training samples. - - This plot is available only for models fitted using - [train sizing][]. [Ensembles][] are ignored. 
- - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - metric: int, str, sequence or None, default=None - Metric to plot (only for multi-metric runs). Use a sequence - or add `+` between options to select more than one. If None, - the metric used to run the pipeline is selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_results - atom.plots:PredictionPlot.plot_successive_halving - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.train_sizing(["LR", "RF"], n_bootstrap=5) - atom.plot_learning_curve() - ``` - - """ - metric = self._get_metric(metric, max_one=False) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - for met in metric: - x, y, std = defaultdict(list), defaultdict(list), defaultdict(list) - for m in models: - x[m._group].append(m._train_idx) - y[m._group].append(get_best_score(m, met)) - if m.bootstrap is not None: - std[m._group].append(m.bootstrap.iloc[:, met].std()) - - for group in x: - fig.add_trace( - self._draw_line( - x=x[group], - y=y[group], - mode="lines+markers", - marker_symbol="circle", - error_y=dict(type="data", array=std[group], visible=True), - parent=group, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - # Add error bands - if m.bootstrap is not None: - fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)" - fig.add_traces( - [ - go.Scatter( - x=x[group], - y=np.add(y[group], std[group]), - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem(group)), - hovertemplate="%{y}upper bound", - legendgroup=group, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - go.Scatter( - x=x[group], - y=np.subtract(y[group], std[group]), - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem(group)), - fill="tonexty", - fillcolor=fillcolor, - hovertemplate="%{y}lower bound", - legendgroup=group, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - ] - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - title=title, - legend=legend, 
- xlabel="Number of training samples", - ylabel="Score", - figsize=figsize, - plotname="plot_learning_curve", - filename=filename, - display=display, - ) - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_lift( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the lift curve. - - Only available for binary classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the metric. Use a sequence - or add `+` between options to select more than one. Choose - from: "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multilabel][] tasks. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. 
- - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_det - atom.plots:PredictionPlot.plot_gains - atom.plots:PredictionPlot.plot_prc - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_lift() - ``` - - """ - dataset = self._get_set(dataset, max_one=False) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - for ds in dataset: - y_true, y_pred = m._get_pred(ds, target, attr="thresh") - - gains = np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() - fig.add_trace( - self._draw_line( - x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), - y=gains / x, - mode="lines", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y=1, xaxis=xaxis, yaxis=yaxis) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Fraction of sample", - ylabel="Lift", - xlim=(0, 1), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_lift", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_parshap( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - columns: SLICE | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper left", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the partial 
correlation of shap values. - - Plots the train and test correlation between the shap value of - every feature with its target value, after removing the effect - of all other features (partial correlation). This plot is - useful to identify the features that are contributing most to - overfitting. Features that lie below the bisector (diagonal - line) performed worse on the test set than on the training set. - If the estimator has a `scores_`, `feature_importances_` or - `coef_` attribute, its normalized values are shown in a color - map. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - columns: int, str, slice, sequence or None, default=None - Features to plot. If None, it plots all features. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. 
- - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_feature_importance - atom.plots:PredictionPlot.plot_partial_dependence - atom.plots:PredictionPlot.plot_permutation_importance - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["GNB", "RF"]) - atom.rf.plot_parshap(legend=None) - atom.plot_parshap(columns=slice(5, 10)) - ``` - - """ - target = self.branch._get_target(target) - - fig = self._get_figure() - - # Colorbar is only needed when a model has feature_importance - if all(m.feature_importance is None for m in models): - xaxis, yaxis = BasePlot._fig.get_axes() - else: - xaxis, yaxis = BasePlot._fig.get_axes( - x=(0, 0.87), - coloraxis=dict( - colorscale="Reds", - title="Normalized feature importance", - font_size=self.label_fontsize, - ) - ) - - for m in models: - parshap = {} - fxs = m.branch._get_columns(columns, include_target=False) - - for ds in ("train", "test"): - # Calculating shap values is computationally expensive, - # therefore select a random subsample for large data sets - if len(data := getattr(m, ds)) > 500: - data = data.sample(500, random_state=self.random_state) - - # Replace data with the calculated shap values - explanation = m._shap.get_explanation(data[m.features], target) - data[m.features] = explanation.values - - parshap[ds] = pd.Series(index=fxs, dtype=float) - for fx in fxs: - # All other features are covariates - covariates = [f for f in data.columns[:-1] if f != fx] - cols = [fx, data.columns[-1], *covariates] - - # Compute covariance - V = data[cols].cov() - - # Inverse covariance matrix - Vi = np.linalg.pinv(V, hermitian=True) - diag = Vi.diagonal() - - D = np.diag(np.sqrt(1 / diag)) - - # Partial correlation matrix - partial_corr = -1 * (D @ Vi 
@ D) # @ is matrix multiplication - - # Semi-partial correlation matrix - with np.errstate(divide="ignore"): - V_sqrt = np.sqrt(np.diag(V))[..., None] - Vi_sqrt = np.sqrt(np.abs(diag - Vi ** 2 / diag[..., None])).T - semi_partial_correlation = partial_corr / V_sqrt / Vi_sqrt - - # X covariates are removed - parshap[ds][fx] = semi_partial_correlation[1, 0] - - # Get the feature importance or coefficients - if m.feature_importance is not None: - color = m.feature_importance.loc[fxs] - else: - color = BasePlot._fig.get_elem("parshap") - - fig.add_trace( - go.Scatter( - x=parshap["train"], - y=parshap["test"], - mode="markers+text", - marker=dict( - color=color, - size=self.marker_size, - coloraxis=f"coloraxis{xaxis[1:]}", - line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), - ), - text=m.features, - textposition="top center", - customdata=(data := None if isinstance(color, str) else list(color)), - hovertemplate=( - f"%{{text}}
(%{{x}}, %{{y}})" - f"{'
Feature importance: %{customdata:.4f}' if data else ''}" - f"{m.name}" - ), - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Training set", - ylabel="Test set", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_parshap", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_partial_dependence( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - columns: SLICE | None = None, - kind: str | SEQUENCE = "average", - pair: int | str | None = None, - target: INT | str = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the partial dependence of features. - - The partial dependence of a feature (or a set of features) - corresponds to the response of the model for each possible - value of the feature. The plot can take two forms: - - - If `pair` is None: Single feature partial dependence lines. - The deciles of the feature values are shown with tick marks - on the bottom. - - If `pair` is defined: Two-way partial dependence plots are - plotted as contour plots (only allowed for a single model). - - Read more about partial dependence on sklearn's - [documentation][partial_dependence]. This plot is not available - for multilabel nor multiclass-multioutput classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - columns: int, str, slice, sequence or None, default=None - Features to get the partial dependence from. 
If None, it - uses the first 3 features in the dataset. - - kind: str or sequence, default="average" - Kind of depedence to plot. Use a sequence or add `+` between - options to select more than one. Choose from: - - - "average": Partial dependence averaged across all samples - in the dataset. - - "individual": Partial dependence for up to 50 random - samples (Individual Conditional Expectation). - - This parameter is ignored when plotting feature pairs. - - pair: int, str or None, default=None - Feature with which to pair the features selected by - `columns`. If specified, the resulting figure displays - contour plots. Only allowed when plotting a single model. - If None, the plots show the partial dependece of single - features. - - target: int or str, default=1 - Class in the target column to look at (only for multiclass - classification tasks). - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_feature_importance - atom.plots:PredictionPlot.plot_parshap - atom.plots:PredictionPlot.plot_permutation_importance - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_partial_dependence(kind="average+individual", legend="upper left") - atom.rf.plot_partial_dependence(columns=(3, 4), pair=2) - ``` - - """ - if any(self.task.startswith(t) for t in ("multilabel", "multiclass-multioutput")): - raise PermissionError( - "The plot_partial_dependence method is not available for multilabel " - f"nor multiclass-multioutput classification tasks, got {self.task}." - ) - elif self.task.startswith("multiclass"): - _, target = self.branch._get_target(target) - else: - target = 0 - - kind = "+".join(lst(kind)).lower() - if any(k not in ("average", "individual") for k in kind.split("+")): - raise ValueError( - f"Invalid value for the kind parameter, got {kind}. " - "Choose from: average, individual." - ) - - axes, names = [], [] - fig = self._get_figure() - for m in models: - color = BasePlot._fig.get_elem(m.name) - - # Since every model can have different fxs, select them - # every time and make sure the models use the same fxs - cols = m.branch._get_columns( - columns=(0, 1, 2) if columns is None else columns, - include_target=False, - ) - - if not names: - names = cols - elif names != cols: - raise ValueError( - "Invalid value for the columns parameter. Not all " - f"models use the same features, got {names} and {cols}." - ) - - if pair is not None: - if len(models) > 1: - raise ValueError( - f"Invalid value for the pair parameter, got {pair}. 
" - "The value must be None when plotting multiple models" - ) - else: - pair = m.branch._get_columns(pair, include_target=False) - cols = [(c, pair[0]) for c in cols] - else: - cols = [(c,) for c in cols] - - # Create new axes - if not axes: - for i, col in enumerate(cols): - # Calculate the distance between subplots - offset = divide(0.025, len(cols) - 1) - - # Calculate the size of the subplot - size = (1 - ((offset * 2) * (len(cols) - 1))) / len(cols) - - # Determine the position for the axes - x_pos = i % len(cols) * (size + 2 * offset) - - xaxis, yaxis = BasePlot._fig.get_axes(x=(x_pos, rnd(x_pos + size))) - axes.append((xaxis, yaxis)) - - # Compute averaged predictions - predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)( - delayed(partial_dependence)( - estimator=m.estimator, - X=m.X_test, - features=col, - kind="both" if "individual" in kind else "average", - ) for col in cols - ) - - # Compute deciles for ticks (only if line plots) - if len(cols[0]) == 1: - deciles = {} - for fx in chain.from_iterable(cols): - if fx not in deciles: # Skip if the feature is repeated - X_col = _safe_indexing(m.X_test, fx, axis=1) - deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1)) - - for i, (ax, fx, pred) in enumerate(zip(axes, cols, predictions)): - # Draw line or contour plot - if len(pred["values"]) == 1: - # For both average and individual: draw ticks on the horizontal axis - for line in deciles[fx[0]]: - fig.add_shape( - type="line", - x0=line, - x1=line, - xref=ax[0], - y0=0, - y1=0.05, - yref=f"{axes[0][1]} domain", - line=dict(width=1, color=BasePlot._fig.get_elem(m.name)), - opacity=0.6, - layer="below", - ) - - # Draw the mean of the individual lines - if "average" in kind: - fig.add_trace( - go.Scatter( - x=pred["values"][0], - y=pred["average"][target].ravel(), - mode="lines", - line=dict(width=2, color=color), - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=ax[0], - 
yaxis=axes[0][1], - ) - ) - - # Draw all individual (per sample) lines (ICE) - if "individual" in kind: - # Select up to 50 random samples to plot - idx = np.random.choice( - list(range(len(pred["individual"][target]))), - size=min(len(pred["individual"][target]), 50), - replace=False, - ) - for sample in pred["individual"][target, idx, :]: - fig.add_trace( - go.Scatter( - x=pred["values"][0], - y=sample, - mode="lines", - line=dict(width=0.5, color=color), - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=ax[0], - yaxis=axes[0][1], - ) - ) - - else: - colorscale = PALETTE.get(BasePlot._fig.get_elem(m.name), "Teal") - fig.add_trace( - go.Contour( - x=pred["values"][0], - y=pred["values"][1], - z=pred["average"][target], - contours=dict( - showlabels=True, - labelfont=dict(size=self.tick_fontsize, color="white") - ), - hovertemplate="x:%{x}
y:%{y}
z:%{z}", - hoverongaps=False, - colorscale=colorscale, - showscale=False, - showlegend=False, - xaxis=ax[0], - yaxis=axes[0][1], - ) - ) - - self._plot( - ax=(f"xaxis{ax[0][1:]}", f"yaxis{ax[1][1:]}"), - xlabel=fx[0], - ylabel=(fx[1] if len(fx) > 1 else "Score") if i == 0 else None, - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - groupclick="togglegroup", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_partial_dependence", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_permutation_importance( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - show: INT | None = None, - n_repeats: INT = 10, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the feature permutation importance of models. - - !!! warning - This method can be slow. Results are cached to fasten - repeated calls. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. - - n_repeats: int, default=10 - Number of times to permute each feature. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). 
If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_feature_importance - atom.plots:PredictionPlot.plot_partial_dependence - atom.plots:PredictionPlot.plot_parshap - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_permutation_importance(show=10, n_repeats=7) - ``` - - """ - show = self._get_show(show, models) - - if n_repeats <= 0: - raise ValueError( - "Invalid value for the n_repeats parameter." - f"Value should be >0, got {n_repeats}." 
- ) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - for m in models: - # Permutation importances returns Bunch object - permutations = self._memory.cache(permutation_importance)( - estimator=m.estimator, - X=m.X_test, - y=m.y_test, - scoring=self._metric[0], - n_repeats=n_repeats, - n_jobs=self.n_jobs, - random_state=self.random_state, - ) - - fig.add_trace( - go.Box( - x=permutations["importances"].ravel(), - y=list(np.array([[fx] * n_repeats for fx in m.features]).ravel()), - marker_color=BasePlot._fig.get_elem(m.name), - boxpoints="outliers", - orientation="h", - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), - "boxmode": "group", - } - ) - - # Unique number of features over all branches - n_fxs = len(set([fx for m in models for fx in m.features])) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Score", - ylim=(n_fxs - show - 0.5, n_fxs - 0.5), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_permutation_importance", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(check_fitted=False)) - def plot_pipeline( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - draw_hyperparameter_tuning: bool = True, - color_branches: bool | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot a diagram of the pipeline. - - !!! warning - This plot uses the [schemdraw][] package, which is - incompatible with [plotly][]. The returned plot is - therefore a [matplotlib figure][pltfigure]. 
- - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models for which to draw the pipeline. If None, all - pipelines are plotted. - - draw_hyperparameter_tuning: bool, default=True - Whether to draw if the models used Hyperparameter Tuning. - - color_branches: bool or None, default=None - Whether to draw every branch in a different color. If None, - branches are colored when there is more than one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the pipeline drawn. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:DataPlot.plot_wordcloud - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["GNB", "RNN", "SGD", "MLP"]) - atom.voting(models=atom.winners[:2]) - atom.plot_pipeline() - - atom = ATOMClassifier(X, y, random_state=1) - atom.scale() - atom.prune() - atom.run("RF", n_trials=30) - - atom.branch = "undersample" - atom.balance("nearmiss") - atom.run("RF_undersample") - - atom.branch = "oversample_from_master" - atom.balance("smote") - atom.run("RF_oversample") - - atom.plot_pipeline() - ``` - - """ - - def get_length(pl, i): - """Get the maximum length of the name of a block.""" - if len(pl) > i: - return max(len(pl[i].__class__.__name__) * 0.5, 7) - else: - return 0 - - def check_y(xy): - """Return y unless there is something right, then jump.""" - while any(pos[0] > xy[0] and pos[1] == xy[1] for pos in positions.values()): - xy = Point((xy[0], xy[1] + height)) - - return xy[1] - - def add_wire(x, y): - """Draw a connecting wire between two estimators.""" - d.add( - Wire(shape="z", k=(x - d.here[0]) / (length + 1), arrow="->") - .to((x, y)) - .color(branch["color"]) - ) - - # Update arrowhead manually - d.elements[-1].segments[-1].arrowwidth = 0.3 - d.elements[-1].segments[-1].arrowlength = 0.5 - - check_dependency("schemdraw") - from schemdraw import Drawing - from schemdraw.flow import Data, RoundBox, Subroutine, Wire - from schemdraw.util import Point - - fig = self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_pipeline") - - # Define branches to plot (if called from model, it's only one) - branches = [] - for branch in getattr(self, "_branches", [self.branch]): - draw_models, draw_ensembles = [], [] - for m in models: - if m.branch is branch: - if m.acronym not in ("Stack", "Vote"): - draw_models.append(m) - 
else: - draw_ensembles.append(m) - - # Additionally, add all dependent models (if not already there) - draw_models.extend([i for i in m._models if i not in draw_models]) - - if not models or draw_models: - branches.append( - { - "name": branch.name, - "pipeline": list(branch.pipeline), - "models": draw_models, - "ensembles": draw_ensembles, - } - ) - - # Define colors per branch - for branch in branches: - if color_branches or (color_branches is None and len(branches) > 1): - color = next(BasePlot._fig.palette) - - # Convert back to format accepted by matplotlib - branch["color"] = unconvert_from_RGB_255(unlabel_rgb(color)) - else: - branch["color"] = "black" - - # Create schematic drawing - d = Drawing(unit=1, backend="matplotlib") - d.config(fontsize=self.tick_fontsize) - d.add(Subroutine(w=8, s=0.7).label("Raw data")) - - height = 3 # Height of every block - length = 5 # Minimum arrow length - - # Define the x-position for every block - x_pos = [d.here[0] + length] - for i in range(max(len(b["pipeline"]) for b in branches)): - len_block = reduce(max, [get_length(b["pipeline"], i) for b in branches]) - x_pos.append(x_pos[-1] + length + len_block) - - # Add positions for scaling, hyperparameter tuning and models - x_pos.extend([x_pos[-1], x_pos[-1]]) - if any(m.scaler for m in models): - x_pos[-1] = x_pos[-2] = x_pos[-3] + length + 7 - if draw_hyperparameter_tuning and any(m.trials is not None for m in models): - x_pos[-1] = x_pos[-2] + length + 11 - - positions = {0: d.here} # Contains the position of every element - for branch in branches: - d.here = positions[0] - - for i, est in enumerate(branch["pipeline"]): - # If the estimator has already been seen, don't draw - if id(est) in positions: - # Change location to estimator's end - d.here = positions[id(est)] - continue - - # Draw transformer - add_wire(x_pos[i], check_y(d.here)) - d.add( - RoundBox(w=max(len(est.__class__.__name__) * 0.5, 7)) - .label(est.__class__.__name__, color="k") - .color(branch["color"]) 
- .anchor("W") - .drop("E") - ) - - positions[id(est)] = d.here - - for model in branch["models"]: - # Position at last transformer or at start - if branch["pipeline"]: - d.here = positions[id(est)] - else: - d.here = positions[0] - - # For a single branch, center models - if len(branches) == 1: - offset = height * (len(branch["models"]) - 1) / 2 - else: - offset = 0 - - # Draw automated feature scaling - if model.scaler: - add_wire(x_pos[-3], check_y((d.here[0], d.here[1] - offset))) - d.add( - RoundBox(w=7) - .label("Scaler", color="k") - .color(branch["color"]) - .drop("E") - ) - offset = 0 - - # Draw hyperparameter tuning - if draw_hyperparameter_tuning and model.trials is not None: - add_wire(x_pos[-2], check_y((d.here[0], d.here[1] - offset))) - d.add( - Data(w=11) - .label("Hyperparameter\nTuning", color="k") - .color(branch["color"]) - .drop("E") - ) - offset = 0 - - # Remove classifier/regressor from model's name - name = model.estimator.__class__.__name__ - if name.lower().endswith("classifier"): - name = name[:-10] - elif name.lower().endswith("regressor"): - name = name[:-9] - - # Draw model - add_wire(x_pos[-1], check_y((d.here[0], d.here[1] - offset))) - d.add( - Data(w=max(len(name) * 0.5, 7)) - .label(name, color="k") - .color(branch["color"]) - .anchor("W") - .drop("E") - ) - - positions[id(model)] = d.here - - # Draw ensembles - max_pos = max(pos[0] for pos in positions.values()) # Max length model names - for branch in branches: - for model in branch["ensembles"]: - # Determine y-position of the ensemble - y_pos = [positions[id(m)][1] for m in model._models] - offset = height / 2 * (len(branch["ensembles"]) - 1) - y = min(y_pos) + (max(y_pos) - min(y_pos)) * 0.5 - offset - y = check_y((max_pos + length, max(min(y_pos), y))) - - d.here = (max_pos + length, y) - - d.add( - Data(w=max(len(model._fullname) * 0.5, 7)) - .label(model._fullname, color="k") - .color(branch["color"]) - .anchor("W") - .drop("E") - ) - - positions[id(model)] = d.here - - # 
Draw a wire from every model to the ensemble - for m in model._models: - d.here = positions[id(m)] - add_wire(max_pos + length, y) - - if not figsize: - dpi, bbox = fig.get_dpi(), d.get_bbox() - figsize = (dpi * bbox.xmax // 4, (dpi / 2) * (bbox.ymax - bbox.ymin)) - - d.draw(canvas=plt.gca(), showframe=False, show=False) - plt.axis("off") - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=plt.gca(), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_pipeline", - filename=filename, - display=display, - ) - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_prc( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower left", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the precision-recall curve. - - Read more about [PRC][] in sklearn's documentation. Only - available for binary classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the metric. Use a sequence - or add `+` between options to select more than one. Choose - from: "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multilabel][] tasks. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. 
- - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_det - atom.plots:PredictionPlot.plot_lift - atom.plots:PredictionPlot.plot_roc - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_prc() - ``` - - """ - dataset = self._get_set(dataset, max_one=False) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - for ds in dataset: - y_true, y_pred = m._get_pred(ds, target, attr="thresh") - - # Get precision-recall pairs for different thresholds - prec, rec, _ = precision_recall_curve(y_true, y_pred) - - fig.add_trace( - self._draw_line( - x=rec, - y=prec, - mode="lines", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(sum(m.y_test) / len(m.y_test), xaxis=xaxis, yaxis=yaxis) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Recall", - ylabel="Precision", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_prc", - filename=filename, - display=display, - ) - - @available_if(has_task("class")) - 
@composed(crash, plot_from_model) - def plot_probabilities( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str = "test", - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the probability distribution of the target classes. - - This plot is available only for models with a `predict_proba` - method in classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str, default="test" - Data set on which to calculate the metric. Choose from: - "train", "test" or "holdout". - - target: int, str or tuple, default=1 - Probability of being that class in the target column. For - multioutput tasks, the value should be a tuple of the form - (column, class). - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. 
Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_confusion_matrix - atom.plots:PredictionPlot.plot_results - atom.plots:PredictionPlot.plot_threshold - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_probabilities() - ``` - - """ - check_predict_proba(models, "plot_probabilities") - ds = self._get_set(dataset, max_one=True) - col, cls = self.branch._get_target(target) - col = lst(self.target)[col] - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - y_true, y_pred = getattr(m, f"y_{ds}"), getattr(m, f"predict_proba_{ds}") - for value in np.unique(m.dataset[col]): - # Get indices per class - if is_multioutput(self.task): - if self.task.startswith("multilabel"): - hist = y_pred.loc[y_true[col] == value, col] - else: - hist = y_pred.loc[cls, col].loc[y_true[col] == value] - else: - hist = y_pred.loc[y_true == value, str(cls)] - - fig.add_trace( - go.Scatter( - x=(x := np.linspace(0, 1, 100)), - y=stats.gaussian_kde(hist)(x), - mode="lines", - line=dict( - width=2, - color=BasePlot._fig.get_elem(m.name), - dash=BasePlot._fig.get_elem(ds, "dash"), - ), - fill="tonexty", - fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)", - fillpattern=dict(shape=BasePlot._fig.get_elem(value, "shape")), - name=f"{col}={value}", - legendgroup=m.name, - legendgrouptitle=dict(text=m.name, font_size=self.label_fontsize), - showlegend=BasePlot._fig.showlegend(f"{m.name}-{value}", legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="toggleitem", - xlabel="Probability", - ylabel="Probability density", - xlim=(0, 1), - title=title, - legend=legend, - 
figsize=figsize, - plotname="plot_probabilities", - filename=filename, - display=display, - ) - - @available_if(has_task("reg")) - @composed(crash, plot_from_model) - def plot_residuals( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "upper left", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot a model's residuals. - - The plot shows the residuals (difference between the predicted - and the true value) on the vertical axis and the independent - variable on the horizontal axis. The gray, intersected line - shows the identity line. This plot can be useful to analyze the - variance of the error of the regressor. If the points are - randomly dispersed around the horizontal axis, a linear - regression model is appropriate for the data; otherwise, a - non-linear model is more appropriate. This plot is only - available for regression tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str, default="test" - Data set on which to calculate the metric. Choose from: - "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multioutput tasks][]. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="upper left" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). 
- - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_errors - - Examples - -------- - ```pycon - from atom import ATOMRegressor - from sklearn.datasets import load_diabetes - - X, y = load_diabetes(return_X_y=True, as_frame=True) - - atom = ATOMRegressor(X, y) - atom.run(["OLS", "LGB"]) - atom.plot_residuals() - ``` - - """ - ds = self._get_set(dataset, max_one=True) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes(x=(0, 0.69)) - xaxis2, yaxis2 = BasePlot._fig.get_axes(x=(0.71, 1.0)) - for m in models: - y_true, y_pred = m._get_pred(ds, target) - - fig.add_trace( - go.Scatter( - x=y_true, - y=(res := np.subtract(y_true, y_pred)), - mode="markers", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - name=m.name, - legendgroup=m.name, - showlegend=BasePlot._fig.showlegend(m.name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.add_trace( - go.Histogram( - y=res, - bingroup="residuals", - marker=dict( - color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", - line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), - ), - name=m.name, - legendgroup=m.name, - showlegend=False, - xaxis=xaxis2, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y=0, xaxis=xaxis, yaxis=yaxis) - - fig.update_layout({f"yaxis{xaxis[1:]}_showgrid": True, "barmode": "overlay"}) - - self._plot( - ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), - xlabel="Distribution", - title=title, - ) - - 
BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - ylabel="Residuals", - xlabel="True value", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_residuals", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model) - def plot_results( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: INT | str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the model results. - - If all models applied bootstrap, the plot is a boxplot. If - not, the plot is a barplot. Models are ordered based on - their score from the top down. The score is either the - `score_bootstrap` or `score_test` attribute of the model, - selected in that order. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - metric: int, str, sequence or None, default=None - Metric to plot (only for multi-metric runs). Other available - options are "time_bo", "time_fit", "time_bootstrap" and - "time". If str, add `+` between options to select more than - one. If None, the metric used to run the pipeline is selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). 
If None, it - adapts the size to the number of models. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_confusion_matrix - atom.plots:PredictionPlot.plot_probabilities - atom.plots:PredictionPlot.plot_threshold - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"]) - atom.plot_results() - - atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"], n_bootstrap=5) - atom.plot_results() - atom.plot_results(metric="time_fit+time") - ``` - - """ - - def get_std(model: MODEL, metric: int) -> SCALAR: - """Get the standard deviation of the bootstrap scores. - - Parameters - ---------- - model: Model - Model to get the std from. - - metric: int - Index of the metric to get it from. - - Returns - ------- - int or float - Standard deviation score or 0 if not bootstrapped. 
- - """ - if model.bootstrap is None: - return 0 - else: - return model.bootstrap.iloc[:, metric].std() - - metric = self._get_metric(metric, max_one=False) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - for met in metric: - if isinstance(met, str): - color = BasePlot._fig.get_elem(met) - fig.add_trace( - go.Bar( - x=[getattr(m, met) for m in models], - y=[m.name for m in models], - orientation="h", - marker=dict( - color=f"rgba({color[4:-1]}, 0.2)", - line=dict(width=2, color=color), - ), - hovertemplate=f"%{{x}}{met}", - name=met, - legendgroup=met, - showlegend=BasePlot._fig.showlegend(met, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - else: - name = self._metric[met].name - color = BasePlot._fig.get_elem() - - if all(m.score_bootstrap for m in models): - x = np.array([m.bootstrap.iloc[:, met] for m in models]).ravel() - y = np.array([[m.name] * len(m.bootstrap) for m in models]).ravel() - fig.add_trace( - go.Box( - x=x, - y=list(y), - marker_color=color, - boxpoints="outliers", - orientation="h", - name=name, - legendgroup=name, - showlegend=BasePlot._fig.showlegend(name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - else: - fig.add_trace( - go.Bar( - x=[get_best_score(m, met) for m in models], - y=[m.name for m in models], - error_x=dict( - type="data", - array=[get_std(m, met) for m in models], - ), - orientation="h", - marker=dict( - color=f"rgba({color[4:-1]}, 0.2)", - line=dict(width=2, color=color), - ), - hovertemplate="%{x}", - name=name, - legendgroup=name, - showlegend=BasePlot._fig.showlegend(name, legend), - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - fig.update_layout( - { - f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), - "bargroupgap": 0.05, - "boxmode": "group", - } - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="time (s)" if all(isinstance(m, str) for m in metric) else "Score", - title=title, - legend=legend, - 
figsize=figsize or (900, 400 + len(models) * 50), - plotname="plot_results", - filename=filename, - display=display, - ) - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_roc( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - dataset: str | SEQUENCE = "test", - target: INT | str = 0, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot the Receiver Operating Characteristics curve. - - Read more about [ROC][] in sklearn's documentation. Only - available for classification tasks. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - dataset: str or sequence, default="test" - Data set on which to calculate the metric. Use a sequence - or add `+` between options to select more than one. Choose - from: "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multilabel][] tasks. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. 
- - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_gains - atom.plots:PredictionPlot.plot_lift - atom.plots:PredictionPlot.plot_prc - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_roc() - ``` - - """ - dataset = self._get_set(dataset, max_one=False) - target = self.branch._get_target(target, only_columns=True) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - for m in models: - for ds in dataset: - # Get False (True) Positive Rate as arrays - fpr, tpr, _ = roc_curve(*m._get_pred(ds, target, attr="thresh")) - - fig.add_trace( - self._draw_line( - x=fpr, - y=tpr, - mode="lines", - parent=m.name, - child=ds, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlim=(-0.03, 1.03), - ylim=(-0.03, 1.03), - xlabel="FPR", - ylabel="TPR", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_roc", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(ensembles=False)) - def plot_successive_halving( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: INT | str | SEQUENCE | None = None, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower right", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot scores per iteration of the successive halving. 
- - Only use with models fitted using [successive halving][]. - [Ensembles][] are ignored. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - metric: int, str, sequence or None, default=None - Metric to plot (only for multi-metric runs). Use a sequence - or add `+` between options to select more than one. If None, - the metric used to run the pipeline is selected. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower right" - Legend for the plot. See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_learning_curve - atom.plots:PredictionPlot.plot_results - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.successive_halving(["Tree", "Bag", "RF", "LGB"], n_bootstrap=5) - atom.plot_successive_halving() - ``` - - """ - metric = self._get_metric(metric, max_one=False) - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - for met in metric: - x, y, std = defaultdict(list), defaultdict(list), defaultdict(list) - for m in models: - x[m._group].append(len(m.branch._idx[1]) // m._train_idx) - y[m._group].append(get_best_score(m, met)) - if m.bootstrap is not None: - std[m._group].append(m.bootstrap.iloc[:, met].std()) - - for group in x: - fig.add_trace( - self._draw_line( - x=x[group], - y=y[group], - mode="lines+markers", - marker_symbol="circle", - error_y=dict(type="data", array=std[group], visible=True), - parent=group, - child=self._metric[met].name, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - # Add error bands - if m.bootstrap is not None: - fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)" - fig.add_traces( - [ - go.Scatter( - x=x[group], - y=np.add(y[group], std[group]), - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem(group)), - hovertemplate="%{y}upper bound", - legendgroup=group, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - go.Scatter( - x=x[group], - y=np.subtract(y[group], std[group]), - mode="lines", - line=dict(width=1, color=BasePlot._fig.get_elem(group)), - fill="tonexty", - fillcolor=fillcolor, - hovertemplate="%{y}lower bound", - legendgroup=group, - showlegend=False, - xaxis=xaxis, - yaxis=yaxis, - ), - ] - ) - - fig.update_layout({f"xaxis{yaxis[1:]}": dict(dtick=1, autorange="reversed")}) - - BasePlot._fig.used_models.extend(models) - 
return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - groupclick="togglegroup", - title=title, - legend=legend, - xlabel="n_models", - ylabel="Score", - figsize=figsize, - plotname="plot_successive_halving", - filename=filename, - display=display, - ) - - @available_if(has_task(["binary", "multilabel"])) - @composed(crash, plot_from_model) - def plot_threshold( - self, - models: INT | str | MODEL | slice | SEQUENCE | None = None, - metric: METRIC_SELECTOR = None, - dataset: str = "test", - target: INT | str = 0, - steps: INT = 100, - *, - title: str | dict | None = None, - legend: str | dict | None = "lower left", - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> go.Figure | None: - """Plot metric performances against threshold values. - - This plot is available only for models with a `predict_proba` - method in a binary or [multilabel][] classification task. - - Parameters - ---------- - models: int, str, Model, slice, sequence or None, default=None - Models to plot. If None, all models are selected. - - metric: str, func, scorer, sequence or None, default=None - Metric to plot. Choose from any of sklearn's scorers, a - function with signature `metric(y_true, y_pred)`, a scorer - object or a sequence of these. Use a sequence or add `+` - between options to select more than one. If None, the - metric used to run the pipeline is selected. - - dataset: str, default="test" - Data set on which to calculate the metric. Choose from: - "train", "test" or "holdout". - - target: int or str, default=0 - Target column to look at. Only for [multilabel][] tasks. - - steps: int, default=100 - Number of thresholds measured. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default="lower left" - Legend for the plot. 
See the [user guide][parameters] for - an extended description of the choices. - - - If None: No legend is shown. - - If str: Location where to show the legend. - - If dict: Legend configuration. - - figsize: tuple, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as html. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [go.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:PredictionPlot.plot_calibration - atom.plots:PredictionPlot.plot_confusion_matrix - atom.plots:PredictionPlot.plot_probabilities - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import make_classification - - X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run(["LR", "RF"]) - atom.plot_threshold() - ``` - - """ - check_predict_proba(models, "plot_threshold") - ds = self._get_set(dataset, max_one=True) - target = self.branch._get_target(target, only_columns=True) - - # Get all metric functions from the input - if metric is None: - metrics = [m._score_func for m in self._metric] - else: - metrics = [] - for m in lst(metric): - if isinstance(m, str): - metrics.extend(m.split("+")) - else: - metrics.append(m) - metrics = [get_custom_scorer(m)._score_func for m in metrics] - - fig = self._get_figure() - xaxis, yaxis = BasePlot._fig.get_axes() - - steps = np.linspace(0, 1, steps) - for m in models: - y_true, y_pred = m._get_pred(ds, target, attr="predict_proba") - for met in metrics: - fig.add_trace( - self._draw_line( - x=steps, - y=[met(y_true, y_pred >= step) for step 
in steps], - parent=m.name, - child=met.__name__, - legend=legend, - xaxis=xaxis, - yaxis=yaxis, - ) - ) - - BasePlot._fig.used_models.extend(models) - return self._plot( - ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), - xlabel="Threshold", - ylabel="Score", - title=title, - legend=legend, - figsize=figsize, - plotname="plot_threshold", - filename=filename, - display=display, - ) - - -@typechecked -class ShapPlot(BasePlot): - """Shap plots. - - ATOM wrapper for plots made by the shap package, using Shapley - values for model interpretation. These plots are accessible from - the runners or from the models. Only one model can be plotted at - the same time since the plots are not made by ATOM. - - """ - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_bar( - self, - models: INT | str | MODEL | None = None, - index: SLICE | None = None, - show: INT | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot SHAP's bar plot. - - Create a bar plot of a set of SHAP values. If a single sample - is passed, then the SHAP values are plotted. If many samples - are passed, then the mean absolute value for each feature - column is plotted. Read more about SHAP plots in the - [user guide][shap]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_bar()`. - - index: int, str, slice, sequence or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. - - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. 
- - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_parshap - atom.plots:ShapPlot.plot_shap_beeswarm - atom.plots:ShapPlot.plot_shap_scatter - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_bar(show=10) - ``` - - """ - rows = models.X.loc[models.branch._get_rows(index)] - show = self._get_show(show, models) - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_bar") - - shap.plots.bar(explanation, max_display=show, show=False) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=plt.gca(), - xlabel=plt.gca().get_xlabel(), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_shap_bar", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_beeswarm( - self, - models: INT | str | MODEL | None = None, - index: slice | SEQUENCE | None = None, - show: INT | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot SHAP's beeswarm plot. - - The plot is colored by feature values. Read more about SHAP - plots in the [user guide][shap]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_beeswarm()`. 
- - index: tuple, slice or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. The beeswarm plot does not support plotting - a single sample. - - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:PredictionPlot.plot_parshap - atom.plots:ShapPlot.plot_shap_bar - atom.plots:ShapPlot.plot_shap_scatter - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_beeswarm(show=10) - ``` - - """ - rows = models.X.loc[models.branch._get_rows(index)] - show = self._get_show(show, models) - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_beeswarm") - - shap.plots.beeswarm(explanation, max_display=show, show=False) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=plt.gca(), - xlabel=plt.gca().get_xlabel(), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_decision( - self, - models: INT | str | MODEL | None = None, - index: SLICE | None = None, - show: INT | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot SHAP's decision plot. - - Visualize model decisions using cumulative SHAP values. Each - plotted line explains a single model prediction. If a single - prediction is plotted, feature values are printed in the - plot (if supplied). If multiple predictions are plotted - together, feature values will not be printed. Plotting too - many predictions together will make the plot unintelligible. - Read more about SHAP plots in the [user guide][shap]. 
- - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_decision()`. - - index: int, str, slice, sequence or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. - - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:ShapPlot.plot_shap_bar - atom.plots:ShapPlot.plot_shap_beeswarm - atom.plots:ShapPlot.plot_shap_force - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_decision(show=10) - atom.plot_shap_decision(index=-1, show=10) - ``` - - """ - rows = models.X.loc[models.branch._get_rows(index)] - show = self._get_show(show, models) - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_decision") - - shap.decision_plot( - base_value=explanation.base_values, - shap_values=explanation.values, - features=rows, - feature_display_range=slice(-1, -show - 1, -1), - auto_size_plot=False, - show=False, - ) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=plt.gca(), - xlabel=plt.gca().get_xlabel(), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_shap_decision", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_force( - self, - models: INT | str | MODEL | None = None, - index: SLICE | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (900, 300), - filename: str | None = None, - display: bool | None = True, - **kwargs, - ) -> plt.Figure | None: - """Plot SHAP's force plot. - - Visualize the given SHAP values with an additive force layout. - Note that by default this plot will render using javascript. - For a regular figure use `matplotlib=True` (this option is - only available when only a single sample is plotted). Read more - about SHAP plots in the [user guide][shap]. 
- - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_force()`. - - index: int, str, slice, sequence or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=(900, 300) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - **kwargs - Additional keyword arguments for [shap.plots.force][force]. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:ShapPlot.plot_shap_beeswarm - atom.plots:ShapPlot.plot_shap_scatter - atom.plots:ShapPlot.plot_shap_decision - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_force(index=-2, matplotlib=True, figsize=(1800, 300)) - ``` - - """ - rows = models.X.loc[models.branch._get_rows(index)] - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - self._get_figure(create_figure=False, backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_force") - - plot = shap.force_plot( - base_value=explanation.base_values, - shap_values=explanation.values, - features=rows, - show=False, - **kwargs, - ) - - if kwargs.get("matplotlib"): - BasePlot._fig.used_models.append(models) - return self._plot( - fig=plt.gcf(), - ax=plt.gca(), - title=title, - legend=legend, - figsize=figsize, - plotname="plot_shap_force", - filename=filename, - display=display, - ) - else: - if filename: # Save to a html file - if not filename.endswith(".html"): - filename += ".html" - shap.save_html(filename, plot) - if display and find_spec("IPython"): - from IPython.display import display - - shap.initjs() - display(plot) - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_heatmap( - self, - models: INT | str | MODEL | None = None, - index: slice | SEQUENCE | None = None, - show: INT | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot SHAP's heatmap plot. - - This plot is designed to show the population substructure of a - dataset using supervised clustering and a heatmap. 
Supervised - clustering involves clustering data points not by their original - feature values but by their explanations. Read more about SHAP - plots in the [user guide][shap]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_heatmap()`. - - index: slice, sequence or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. The plot_shap_heatmap method does not - support plotting a single sample. - - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:ShapPlot.plot_shap_decision - atom.plots:ShapPlot.plot_shap_force - atom.plots:ShapPlot.plot_shap_waterfall - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_heatmap(show=10) - ``` - - """ - rows = models.X.loc[models.branch._get_rows(index)] - show = self._get_show(show, models) - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_heatmap") - - shap.plots.heatmap(explanation, max_display=show, show=False) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=plt.gca(), - xlabel=plt.gca().get_xlabel(), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_shap_heatmap", - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_scatter( - self, - models: INT | str | MODEL | None = None, - index: slice | SEQUENCE | None = None, - columns: INT | str = 0, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] = (900, 600), - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot SHAP's scatter plot. - - Plots the value of the feature on the x-axis and the SHAP value - of the same feature on the y-axis. This shows how the model - depends on the given feature, and is like a richer extension of - the classical partial dependence plots. Vertical dispersion of - the data points represents interaction effects. Read more about - SHAP plots in the [user guide][shap]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. 
If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_scatter()`. - - index: slice, sequence or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. The plot_shap_scatter method does not - support plotting a single sample. - - columns: int or str, default=0 - Column to plot. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=(900, 600) - Figure's size in pixels, format as (x, y). - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. 
- - See Also - -------- - atom.plots:ShapPlot.plot_shap_beeswarm - atom.plots:ShapPlot.plot_shap_decision - atom.plots:ShapPlot.plot_shap_force - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_scatter(columns="symmetry error") - ``` - - """ - rows = models.X.loc[models.branch._get_rows(index)] - column = models.branch._get_columns(columns, include_target=False)[0] - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - # Get explanation for a specific column - explanation = explanation[:, models.columns.get_loc(column)] - - self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_scatter") - - shap.plots.scatter(explanation, color=explanation, ax=plt.gca(), show=False) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=plt.gca(), - xlabel=plt.gca().get_xlabel(), - ylabel=plt.gca().get_ylabel(), - title=title, - legend=legend, - plotname="plot_shap_scatter", - figsize=figsize, - filename=filename, - display=display, - ) - - @composed(crash, plot_from_model(max_one=True)) - def plot_shap_waterfall( - self, - models: INT | str | MODEL | None = None, - index: INT | str | None = None, - show: INT | None = None, - target: INT | str | tuple = 1, - *, - title: str | dict | None = None, - legend: str | dict | None = None, - figsize: tuple[INT, INT] | None = None, - filename: str | None = None, - display: bool | None = True, - ) -> plt.Figure | None: - """Plot SHAP's waterfall plot. - - The SHAP value of a feature represents the impact of the - evidence provided by that feature on the model’s output. 
The - waterfall plot is designed to visually display how the SHAP - values (evidence) of each feature move the model output from - our prior expectation under the background data distribution, - to the final model prediction given the evidence of all the - features. Features are sorted by the magnitude of their SHAP - values with the smallest magnitude features grouped together - at the bottom of the plot when the number of features in the - models exceeds the `show` parameter. Read more about SHAP plots - in the [user guide][shap]. - - Parameters - ---------- - models: int, str, Model or None, default=None - Model to plot. If None, all models are selected. Note that - leaving the default option could raise an exception if there - are multiple models. To avoid this, call the plot directly - from a model, e.g. `atom.lr.plot_shap_waterfall()`. - - index: int, str or None, default=None - Rows in the dataset to plot. If None, it selects all rows - in the test set. The plot_shap_waterfall method does not - support plotting multiple samples. - - show: int or None, default=None - Number of features (ordered by importance) to show. If - None, it shows all features. - - target: int, str or tuple, default=1 - Class in the target column to target. For multioutput tasks, - the value should be a tuple of the form (column, class). - Note that for binary and multilabel tasks, the selected - class is always the positive one. - - title: str, dict or None, default=None - Title for the plot. - - - If None, no title is shown. - - If str, text for the title. - - If dict, [title configuration][parameters]. - - legend: str, dict or None, default=None - Does nothing. Implemented for continuity of the API. - - figsize: tuple or None, default=None - Figure's size in pixels, format as (x, y). If None, it - adapts the size to the number of features shown. - - filename: str or None, default=None - Save the plot using this name. Use "auto" for automatic - naming. 
The type of the file depends on the provided name - (.html, .png, .pdf, etc...). If `filename` has no file type, - the plot is saved as png. If None, the plot is not saved. - - display: bool or None, default=True - Whether to render the plot. If None, it returns the figure. - - Returns - ------- - [plt.Figure][] or None - Plot object. Only returned if `display=None`. - - See Also - -------- - atom.plots:ShapPlot.plot_shap_bar - atom.plots:ShapPlot.plot_shap_beeswarm - atom.plots:ShapPlot.plot_shap_heatmap - - Examples - -------- - ```pycon - from atom import ATOMClassifier - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True, as_frame=True) - - atom = ATOMClassifier(X, y, random_state=1) - atom.run("LR") - atom.plot_shap_waterfall(show=10) - ``` - - """ - rows = models.X.loc[[models.branch._get_rows(index)[0]]] - show = self._get_show(show, models) - target = self.branch._get_target(target) - explanation = models._shap.get_explanation(rows, target) - - # Waterfall accepts only one row - explanation.values = explanation.values[0] - explanation.data = explanation.data[0] - - self._get_figure(backend="matplotlib") - check_canvas(BasePlot._fig.is_canvas, "plot_shap_waterfall") - - shap.plots.waterfall(explanation, max_display=show, show=False) - - BasePlot._fig.used_models.append(models) - return self._plot( - ax=plt.gca(), - title=title, - legend=legend, - figsize=figsize or (900, 400 + show * 50), - plotname="plot_shap_waterfall", - filename=filename, - display=display, - ) diff --git a/atom/plots/__init__.py b/atom/plots/__init__.py new file mode 100644 index 000000000..765a2ac2e --- /dev/null +++ b/atom/plots/__init__.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module for plots. 
+ +""" + +from atom.plots.dataplot import DataPlot +from atom.plots.featureselectionplot import FeatureSelectionPlot +from atom.plots.hyperparametertuningplot import HyperparameterTuningPlot +from atom.plots.predictionplot import PredictionPlot +from atom.plots.shapplot import ShapPlot + + +class ATOMPlot( + FeatureSelectionPlot, + DataPlot, + HyperparameterTuningPlot, + PredictionPlot, + ShapPlot, +): + """Plot classes inherited by main ATOM classes.""" + pass + + +class RunnerPlot(HyperparameterTuningPlot, PredictionPlot, ShapPlot): + """Plot classes inherited by the runners and callable from models.""" + pass diff --git a/atom/plots/base.py b/atom/plots/base.py new file mode 100644 index 000000000..7028c6ce9 --- /dev/null +++ b/atom/plots/base.py @@ -0,0 +1,1117 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing the base classes for plotting. + +""" + +from __future__ import annotations + +from contextlib import contextmanager +from dataclasses import dataclass +from itertools import cycle +from typing import Literal + +import matplotlib.pyplot as plt +import plotly.express as px +import plotly.graph_objects as go +from mlflow.tracking import MlflowClient +from typeguard import typechecked + +from atom.utils.constants import PALETTE +from atom.utils.types import ( + BOOL, DATAFRAME, FLOAT, INDEX, INT, INT_TYPES, LEGEND, MODEL, SCALAR, + SEQUENCE, +) +from atom.utils.utils import ( + composed, crash, divide, get_custom_scorer, lst, rnd, to_rgb, +) + + +@dataclass +class Aesthetics: + """Keeps track of plot aesthetics.""" + + palette: SEQUENCE # Sequence of colors + title_fontsize: INT # Fontsize for titles + label_fontsize: INT # Fontsize for labels, legend and hoverinfo + tick_fontsize: INT # Fontsize for ticks + line_width: INT # Width of the line plots + marker_size: INT # Size of the markers + + +@typechecked +class BaseFigure: + """Base plotly figure. 
+ + The instance stores the position of the current axes in grid, + as well as the models used for the plot (to track in mlflow). + + Parameters + ---------- + rows: int, default=1 + Number of subplot rows in the canvas. + + cols: int, default=1 + Number of subplot columns in the canvas. + + horizontal_spacing: float, default=0.05 + Space between subplot columns in normalized plot coordinates. + The spacing is relative to the figure's size. + + vertical_spacing: float, default=0.07 + Space between subplot rows in normalized plot coordinates. + The spacing is relative to the figure's size. + + palette: str or sequence, default="Prism" + Name or color sequence for the palette. + + is_canvas: bool, default=False + Whether the figure shows multiple plots. + + backend: str, default="plotly" + Figure's backend. Choose between plotly or matplotlib. + + create_figure: bool, default=True + Whether to create a new figure. + + """ + + _marker = ["circle", "x", "diamond", "pentagon", "star", "hexagon"] + _dash = [None, "dashdot", "dash", "dot", "longdash", "longdashdot"] + _shape = ["", "/", "x", "\\", "-", "|", "+", "."] + + def __init__( + self, + rows: INT = 1, + cols: INT = 1, + horizontal_spacing: FLOAT = 0.05, + vertical_spacing: FLOAT = 0.07, + palette: str | SEQUENCE = "Prism", + is_canvas: BOOL = False, + backend: Literal["plotly", "matplotlib"] = "plotly", + create_figure: BOOL = True, + ): + self.rows = rows + self.cols = cols + self.horizontal_spacing = horizontal_spacing + self.vertical_spacing = vertical_spacing + if isinstance(palette, str): + self._palette = getattr(px.colors.qualitative, palette) + self.palette = cycle(self._palette) + else: + # Convert color names or hex to rgb + self._palette = list(map(to_rgb, palette)) + self.palette = cycle(self._palette) + self.is_canvas = is_canvas + self.backend = backend + self.create_figure = create_figure + + self.idx = 0 # N-th plot in the canvas + self.axes = 0 # N-th axis in the canvas + if self.create_figure: + if
self.backend == "plotly": + self.figure = go.Figure() + else: + self.figure, _ = plt.subplots() + + self.groups = [] + self.style = dict(palette={}, marker={}, dash={}, shape={}) + self.marker = cycle(self._marker) + self.dash = cycle(self._dash) + self.shape = cycle(self._shape) + + self.pos = {} # Subplot position to use for title + self.custom_layout = {} # Layout params specified by user + self.used_models = [] # Models plotted in this figure + + # Perform parameter checks + if not 0 < horizontal_spacing < 1: + raise ValueError( + "Invalid value for the horizontal_spacing parameter. The " + f"value must lie between 0 and 1, got {horizontal_spacing}." + ) + + if not 0 < vertical_spacing < 1: + raise ValueError( + "Invalid value for the vertical_spacing parameter. The " + f"value must lie between 0 and 1, got {vertical_spacing}." + ) + + @property + def grid(self) -> tuple[INT, INT]: + """Position of the current axes on the grid. + + Returns + ------- + int + Row position. + + int + Column position. + + """ + return (self.idx - 1) // self.cols + 1, self.idx % self.cols or self.cols + + @property + def next_subplot(self) -> go.Figure | plt.Figure | None: + """Increase the subplot index. + + Returns + ------- + go.Figure, plt.Figure or None + Current figure. Returns None if `create_figure=False`. + + """ + # Check if there are too many plots in the canvas + if self.idx >= self.rows * self.cols: + raise ValueError( + "Invalid number of plots in the canvas! Increase " + "the number of rows and cols to add more plots." + ) + else: + self.idx += 1 + + if self.create_figure: + return self.figure + + def get_elem( + self, + name: SCALAR | str | None = None, + element: Literal["palette", "marker", "dash", "shape"] = "palette", + ) -> str | None: + """Get the plot element for a specific name. + + This method is used to assign the same element (color, marker, + etc...) to the same columns and models in a plot.
+ + Parameters + ---------- + name: int, float or str or None, default=None + Name for which to get the plot element. The name is stored in + the element attributes to assign the same element to all calls + with the same name. If None, return the first element. + + element: str, default="palette" + Plot element to get. Choose from: palette, marker, dash, shape. + + Returns + ------- + str or None + Element code. + + """ + if name is None: + return getattr(self, f"_{element}")[0] # Get first element (default) + elif name in self.style[element]: + return self.style[element][name] + else: + return self.style[element].setdefault(name, next(getattr(self, element))) + + def showlegend(self, name: str, legend: LEGEND | dict | None) -> BOOL: + """Get whether the trace should be showed in the legend. + + If there's already a trace with the same name, it's not + necessary to show it in the plot's legend. + + Parameters + ---------- + name: str + Name of the trace. + + legend: str, dict or None + Legend parameter. + + Returns + ------- + bool + Whether the trace should be placed in the legend. + + """ + if name in self.groups: + return False + else: + self.groups.append(name) + return legend is not None + + def get_axes( + self, + x: tuple[SCALAR, SCALAR] = (0, 1), + y: tuple[SCALAR, SCALAR] = (0, 1), + coloraxis: dict | None = None, + ) -> tuple[str, str]: + """Create and update the plot's axes. + + Parameters + ---------- + x: tuple + Relative x-size of the plot. + + y: tuple + Relative y-size of the plot. + + coloraxis: dict or None + Properties of the coloraxis to create. None to ignore. + + Returns + ------- + str + Name of the x-axis. + + str + Name of the y-axis. 
+ + """ + self.axes += 1 + + # Calculate the distance between subplots + x_offset = divide(self.horizontal_spacing, (self.cols - 1)) + y_offset = divide(self.vertical_spacing, (self.rows - 1)) + + # Calculate the size of the subplot + x_size = (1 - ((x_offset * 2) * (self.cols - 1))) / self.cols + y_size = (1 - ((y_offset * 2) * (self.rows - 1))) / self.rows + + # Calculate the size of the axes + ax_size = (x[1] - x[0]) * x_size + ay_size = (y[1] - y[0]) * y_size + + # Determine the position for the axes + x_pos = (self.grid[1] - 1) * (x_size + 2 * x_offset) + x[0] * x_size + y_pos = (self.rows - self.grid[0]) * (y_size + 2 * y_offset) + y[0] * y_size + + # Store positions for subplot title + self.pos[str(self.axes)] = (x_pos + ax_size / 2, rnd(y_pos + ay_size)) + + # Update the figure with the new axes + self.figure.update_layout( + { + f"xaxis{self.axes}": dict( + domain=(x_pos, rnd(x_pos + ax_size)), anchor=f"y{self.axes}" + ), + f"yaxis{self.axes}": dict( + domain=(y_pos, rnd(y_pos + ay_size)), anchor=f"x{self.axes}" + ), + } + ) + + # Place a colorbar right of the axes + if coloraxis: + if title := coloraxis.pop("title", None): + coloraxis["colorbar_title"] = dict( + text=title, side="right", font_size=coloraxis.pop("font_size") + ) + + coloraxis["colorbar_x"] = rnd(x_pos + ax_size) + ax_size / 40 + coloraxis["colorbar_xanchor"] = "left" + coloraxis["colorbar_y"] = y_pos + ay_size / 2 + coloraxis["colorbar_yanchor"] = "middle" + coloraxis["colorbar_len"] = ay_size * 0.9 + coloraxis["colorbar_thickness"] = ax_size * 30 # Default width in pixels + self.figure.update_layout( + {f"coloraxis{coloraxis.pop('axes', self.axes)}": coloraxis} + ) + + xaxis = f"x{self.axes if self.axes > 1 else ''}" + yaxis = f"y{self.axes if self.axes > 1 else ''}" + return xaxis, yaxis + + +@typechecked +class BasePlot: + """Base class for all plotting methods. + + This base class defines the properties that can be changed + to customize the plot's aesthetics. 
+ + """ + + _fig = None + _custom_layout = {} + _custom_traces = {} + _aesthetics = Aesthetics( + palette=list(PALETTE), + title_fontsize=24, + label_fontsize=16, + tick_fontsize=12, + line_width=2, + marker_size=8, + ) + + # Properties =================================================== >> + + @property + def aesthetics(self) -> Aesthetics: + """All plot aesthetic attributes.""" + return self._aesthetics + + @aesthetics.setter + def aesthetics(self, value: dict): + self.palette = value.get("palette", self.palette) + self.title_fontsize = value.get("title_fontsize", self.title_fontsize) + self.label_fontsize = value.get("label_fontsize", self.label_fontsize) + self.tick_fontsize = value.get("tick_fontsize", self.tick_fontsize) + self.line_width = value.get("line_width", self.line_width) + self.marker_size = value.get("marker_size", self.marker_size) + + @property + def palette(self) -> str | SEQUENCE: + """Color palette. + + Specify one of plotly's [built-in palettes][palette] or create + a custom one, e.g. `atom.palette = ["red", "green", "blue"]`. + + """ + return self._aesthetics.palette + + @palette.setter + def palette(self, value: str | SEQUENCE): + if isinstance(value, str) and not hasattr(px.colors.qualitative, value): + raise ValueError( + f"Invalid value for the palette parameter, got {value}. Choose " + f"from one of plotly's built-in qualitative color sequences in " + f"the px.colors.qualitative module or define your own sequence." + ) + + self._aesthetics.palette = value + + @property + def title_fontsize(self) -> INT: + """Fontsize for the plot's title.""" + return self._aesthetics.title_fontsize + + @title_fontsize.setter + def title_fontsize(self, value: INT): + if value <= 0: + raise ValueError( + "Invalid value for the title_fontsize parameter. " + f"Value should be >0, got {value}."
+ ) + + self._aesthetics.title_fontsize = value + + @property + def label_fontsize(self) -> INT: + """Fontsize for the labels, legend and hover information.""" + return self._aesthetics.label_fontsize + + @label_fontsize.setter + def label_fontsize(self, value: INT): + if value <= 0: + raise ValueError( + "Invalid value for the label_fontsize parameter. " + f"Value should be >0, got {value}." + ) + + self._aesthetics.label_fontsize = value + + @property + def tick_fontsize(self) -> INT: + """Fontsize for the ticks along the plot's axes.""" + return self._aesthetics.tick_fontsize + + @tick_fontsize.setter + def tick_fontsize(self, value: INT): + if value <= 0: + raise ValueError( + "Invalid value for the tick_fontsize parameter. " + f"Value should be >0, got {value}." + ) + + self._aesthetics.tick_fontsize = value + + @property + def line_width(self) -> INT: + """Width of the line plots.""" + return self._aesthetics.line_width + + @line_width.setter + def line_width(self, value: INT): + if value <= 0: + raise ValueError( + "Invalid value for the line_width parameter. " + f"Value should be >0, got {value}." + ) + + self._aesthetics.line_width = value + + @property + def marker_size(self) -> INT: + """Size of the markers.""" + return self._aesthetics.marker_size + + @marker_size.setter + def marker_size(self, value: INT): + if value <= 0: + raise ValueError( + "Invalid value for the marker_size parameter. " + f"Value should be >0, got {value}." + ) + + self._aesthetics.marker_size = value + + # Methods ====================================================== >> + + @staticmethod + def _get_plot_index(df: DATAFRAME) -> INDEX: + """Return the dataset's index in a plottable format. + + Plotly does not accept all index formats (e.g. pd.Period), + thus use this utility method to convert to timestamp those + indices that can, else return as is. + + Parameters + ---------- + df: dataframe + Data set to get the index from.
+ + Returns + ------- + index + Index in an acceptable format. + + """ + if hasattr(df.index, "to_timestamp"): + return df.index.to_timestamp() + else: + return df.index + + @staticmethod + def _get_show(show: INT | None, model: MODEL | list[MODEL]) -> INT: + """Check and return the number of features to show. + + Parameters + ---------- + show: int or None + Number of features to show. If None or too high, select all (max 200). + + model: Model or list + Models from which to get the features. + + Returns + ------- + int + Number of features to show. + + """ + max_fxs = max(m.n_features for m in lst(model)) + if show is None or show > max_fxs: + # Limit max features shown to avoid maximum figsize error + show = min(200, max_fxs) + elif show < 1: + raise ValueError( + f"Invalid value for the show parameter. Value should be >0, got {show}." + ) + + return show + + @staticmethod + def _get_hyperparams( + params: str | slice | SEQUENCE | None, + model: MODEL, + ) -> list[str]: + """Check and return a model's hyperparameters. + + Parameters + ---------- + params: str, slice, sequence or None + Hyperparameters to get. Use a sequence or add `+` between + options to select more than one. If None, all the model's + hyperparameters are selected. + + model: Model + Get the params from this model. + + Returns + ------- + list of str + Selected hyperparameters. + + """ + if params is None: + hyperparameters = list(model._ht["distributions"]) + elif isinstance(params, slice): + hyperparameters = list(model._ht["distributions"])[params] + else: + hyperparameters = [] + for param in lst(params): + if isinstance(param, INT_TYPES): + hyperparameters.append(list(model._ht["distributions"])[param]) + elif isinstance(param, str): + for p in param.split("+"): + if p not in model._ht["distributions"]: + raise ValueError( + "Invalid value for the params parameter. " + f"Hyperparameter {p} was not used during the " + f"optimization of model {model.name}."
+ ) + else: + hyperparameters.append(p) + + if not hyperparameters: + raise ValueError(f"Didn't find any hyperparameters for model {model.name}.") + + return hyperparameters + + def _get_metric( + self, + metric: INT | str | SEQUENCE | None, + max_one: BOOL, + ) -> INT | str | list[INT | str]: + """Check and return the provided metric index. + + Parameters + ---------- + metric: int, str, sequence or None + Metric to retrieve. If None, all metrics are returned. + + max_one: bool + Whether one or multiple metrics are allowed. + + Returns + ------- + int or list + Position index of the metric. If `max_one=False`, returns + a list of metric positions. + + """ + if metric is None: + return list(range(len(self._metric))) + else: + inc = [] + for met in lst(metric): + if isinstance(met, INT_TYPES): + if 0 <= met < len(self._metric): + inc.append(met) + else: + raise ValueError( + f"Invalid value for the metric parameter. Value {met} is out " + f"of range for a pipeline with {len(self._metric)} metrics." + ) + elif isinstance(met, str): + met = met.lower() + for m in met.split("+"): + if m in ("time_ht", "time_fit", "time_bootstrap", "time"): + inc.append(m) + elif (name := get_custom_scorer(m).name) in self.metric: + inc.append(self._metric.index(name)) + else: + raise ValueError( + "Invalid value for the metric parameter. The " + f"{name} metric wasn't used to fit the models." + ) + + if len(inc) > 1 and max_one: + raise ValueError( + "Invalid value for the metric parameter. " + f"Only one metric is allowed, got {inc}." + ) + + return inc[0] if max_one else inc + + def _get_set( + self, + dataset: str | SEQUENCE, + max_one: BOOL, + allow_holdout: BOOL = True, + ) -> str | list[str]: + """Check and return the provided data set. + + Parameters + ---------- + dataset: str or sequence + Name(s) of the data set to retrieve. + + max_one: bool + Whether one or multiple data sets are allowed. If True, return + the data set instead of a list. 
+ + allow_holdout: bool, default=True + Whether to allow the retrieval of the holdout set. + + Returns + ------- + str or list + Selected data set(s). + + """ + for ds in (dataset := "+".join(lst(dataset)).lower().split("+")): + if ds == "holdout": + if allow_holdout: + if self.holdout is None: + raise ValueError( + "Invalid value for the dataset parameter. No holdout " + "data set was specified when initializing the instance." + ) + else: + raise ValueError( + "Invalid value for the dataset parameter, got " + f"{ds}. Choose from: train, test." + ) + elif ds not in ("train", "test"): + raise ValueError( + "Invalid value for the dataset parameter, got {ds}. " + f"Choose from: train, test{', holdout' if allow_holdout else ''}." + ) + + if max_one and len(dataset) > 1: + raise ValueError( + "Invalid value for the dataset parameter, got " + f"{dataset}. Only one data set is allowed." + ) + + return dataset[0] if max_one else dataset + + def _get_figure(self, **kwargs) -> go.Figure | plt.Figure | None: + """Return existing figure if in canvas, else a new figure. + + Every time this method is called from a canvas, the plot + index is raised by one to keep track in which subplot the + BaseFigure is at. + + Parameters + ---------- + **kwargs + Additional keyword arguments for BaseFigure. + + Returns + ------- + [go.Figure][], [plt.Figure][] or None + Existing figure or newly created. Returns None if kwarg + `create_figure=False`. + + """ + if BasePlot._fig and BasePlot._fig.is_canvas: + return BasePlot._fig.next_subplot + else: + BasePlot._fig = BaseFigure(palette=self.palette, **kwargs) + return BasePlot._fig.next_subplot + + def _draw_line( + self, + parent: str, + child: str | None = None, + legend: str | dict = None, + **kwargs, + ) -> go.Scatter: + """Draw a line. + + Unify the style to draw a line, where parent and child + (e.g. model - data set or column - distribution) keep the + same style (color or dash). 
A legendgroup title is only added + when there is a child element. + + Parameters + ---------- + parent: str + Name of the model. + + child: str or None, default=None + Data set which is plotted. + + legend: str, dict or None + Legend argument provided by the user. + + **kwargs + Additional keyword arguments for the trace. + + Returns + ------- + go.Scatter + New trace to add to figure. + + """ + legendgrouptitle = dict(text=parent, font_size=self.label_fontsize) + hover = f"(%{{x}}, %{{y}}){parent}{f' - {child}' if child else ''}" + return go.Scatter( + line=dict( + width=self.line_width, + color=BasePlot._fig.get_elem(parent), + dash=BasePlot._fig.get_elem(child, "dash"), + ), + marker=dict( + symbol=BasePlot._fig.get_elem(child, "marker"), + size=self.marker_size, + color=BasePlot._fig.get_elem(parent), + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + ), + hovertemplate=kwargs.pop("hovertemplate", hover), + name=kwargs.pop("name", child or parent), + legendgroup=kwargs.pop("legendgroup", parent), + legendgrouptitle=legendgrouptitle if child else None, + showlegend=BasePlot._fig.showlegend(f"{parent}-{child}", legend), + **kwargs, + ) + + @staticmethod + def _draw_straight_line(y: SCALAR | str, xaxis: str, yaxis: str): + """Draw a line across the axis. + + The line can be either horizontal or diagonal. The line should + be used as reference. It's not added to the legend and doesn't + show any information on hover. + + Parameters + ---------- + y: int, float or str, default = "diagonal" + Coordinates on the y-axis. If a value, draw a horizontal line + at that value. If "diagonal", draw a diagonal line from x. + + xaxis: str + Name of the x-axis to draw in. + + yaxis: str + Name of the y-axis to draw in. 
+ + """ + BasePlot._fig.figure.add_shape( + type="line", + x0=0, + x1=1, + y0=0 if y == "diagonal" else y, + y1=1 if y == "diagonal" else y, + xref=f"{xaxis} domain", + yref=f"{yaxis} domain" if y == "diagonal" else yaxis, + line=dict(width=1, color="black", dash="dash"), + opacity=0.6, + layer="below", + ) + + def _plot( + self, + fig: go.Figure | plt.Figure | None = None, + ax: plt.Axes | tuple[str, str] | None = None, + **kwargs, + ) -> go.Figure | plt.Figure | None: + """Make the plot. + + Customize the axes to the default layout and plot the figure + if it's not part of a canvas. + + Parameters + ---------- + fig: go.Figure, plt.Figure or None + Current figure. If None, use `plt.gcf()`. + + ax: plt.Axes, tuple or None, default=None + Axis object or names of the axes to update. If None, ignore + their update. + + **kwargs + Keyword arguments containing the figure's parameters. + + - title: Name of the title or custom configuration. + - legend: Whether to show the legend or custom configuration. + - xlabel: Label for the x-axis. + - ylabel: Label for the y-axis. + - xlim: Limits for the x-axis. + - ylim: Limits for the y-axis. + - figsize: Size of the figure. + - filename: Name of the saved file. + - plotname: Name of the plot. + - display: Whether to show the plot. If None, return the figure. + + Returns + ------- + plt.Figure, go.Figure or None + Created figure. Only returned if `display=None`. 
+ + """ + # Set name with which to save the file + if kwargs.get("filename"): + if kwargs["filename"].endswith("auto"): + name = kwargs["filename"].replace("auto", kwargs["plotname"]) + else: + name = kwargs["filename"] + else: + name = kwargs.get("plotname") + + fig = fig or BasePlot._fig.figure + if BasePlot._fig.backend == "plotly": + if ax: + fig.update_layout( + { + f"{ax[0]}_title": dict( + text=kwargs.get("xlabel"), font_size=self.label_fontsize + ), + f"{ax[1]}_title": dict( + text=kwargs.get("ylabel"), font_size=self.label_fontsize + ), + f"{ax[0]}_range": kwargs.get("xlim"), + f"{ax[1]}_range": kwargs.get("ylim"), + f"{ax[0]}_automargin": True, + f"{ax[1]}_automargin": True, + } + ) + + if BasePlot._fig.is_canvas and (title := kwargs.get("title")): + # Add a subtitle to a plot in the canvas + default_title = { + "x": BasePlot._fig.pos[ax[0][5:] or "1"][0], + "y": BasePlot._fig.pos[ax[0][5:] or "1"][1] + 0.005, + "xref": "paper", + "yref": "paper", + "xanchor": "center", + "yanchor": "bottom", + "showarrow": False, + "font_size": self.title_fontsize - 4, + } + + if isinstance(title, dict): + title = {**default_title, **title} + else: + title = {"text": title, **default_title} + + fig.update_layout(dict(annotations=fig.layout.annotations + (title,))) + + if not BasePlot._fig.is_canvas and kwargs.get("plotname"): + default_title = dict( + x=0.5, + y=1, + pad=dict(t=15, b=15), + xanchor="center", + yanchor="top", + xref="paper", + font_size=self.title_fontsize, + ) + if isinstance(title := kwargs.get("title"), dict): + title = {**default_title, **title} + else: + title = {"text": title, **default_title} + + default_legend = dict( + traceorder="grouped", + groupclick=kwargs.get("groupclick", "toggleitem"), + font_size=self.label_fontsize, + bgcolor="rgba(255, 255, 255, 0.5)", + ) + if isinstance(legend := kwargs.get("legend"), str): + position = {} + if legend == "upper left": + position = dict(x=0.01, y=0.99, xanchor="left", yanchor="top") + elif legend == 
"lower left": + position = dict(x=0.01, y=0.01, xanchor="left", yanchor="bottom") + elif legend == "upper right": + position = dict(x=0.99, y=0.99, xanchor="right", yanchor="top") + elif legend == "lower right": + position = dict(x=0.99, y=0.01, xanchor="right", yanchor="bottom") + elif legend == "upper center": + position = dict(x=0.5, y=0.99, xanchor="center", yanchor="top") + elif legend == "lower center": + position = dict(x=0.5, y=0.01, xanchor="center", yanchor="bottom") + elif legend == "center left": + position = dict(x=0.01, y=0.5, xanchor="left", yanchor="middle") + elif legend == "center right": + position = dict(x=0.99, y=0.5, xanchor="right", yanchor="middle") + elif legend == "center": + position = dict(x=0.5, y=0.5, xanchor="center", yanchor="middle") + legend = {**default_legend, **position} + elif isinstance(legend, dict): + legend = {**default_legend, **legend} + + # Update layout with predefined settings + space1 = self.title_fontsize if title.get("text") else 10 + space2 = self.title_fontsize * int(bool(fig.layout.annotations)) + fig.update_layout( + title=title, + legend=legend, + showlegend=bool(kwargs.get("legend")), + hoverlabel=dict(font_size=self.label_fontsize), + font_size=self.tick_fontsize, + margin=dict(l=50, b=50, r=0, t=25 + space1 + space2, pad=0), + width=kwargs["figsize"][0], + height=kwargs["figsize"][1], + ) + + # Update plot with custom settings + fig.update_traces(**self._custom_traces) + fig.update_layout(**self._custom_layout) + + if kwargs.get("filename"): + if "." not in name or name.endswith(".html"): + fig.write_html(name if "." in name else name + ".html") + else: + fig.write_image(name) + + # Log plot to mlflow run of every model visualized + if getattr(self, "experiment", None) and self.log_plots: + for m in set(BasePlot._fig.used_models): + MlflowClient().log_figure( + run_id=m._run.info.run_id, + figure=fig, + artifact_file=name if "." 
in name else f"{name}.html", + ) + + if kwargs.get("display") is True: + fig.show() + elif kwargs.get("display") is None: + return fig + + else: + if kwargs.get("title"): + ax.set_title(kwargs.get("title"), fontsize=self.title_fontsize, pad=20) + if kwargs.get("xlabel"): + ax.set_xlabel(kwargs["xlabel"], fontsize=self.label_fontsize, labelpad=12) + if kwargs.get("ylabel"): + ax.set_ylabel(kwargs["ylabel"], fontsize=self.label_fontsize, labelpad=12) + if ax is not None: + ax.tick_params(axis="both", labelsize=self.tick_fontsize) + + if kwargs.get("figsize"): + # Convert from pixels to inches + fig.set_size_inches( + kwargs["figsize"][0] // fig.get_dpi(), + kwargs["figsize"][1] // fig.get_dpi(), + ) + plt.tight_layout() + if kwargs.get("filename"): + fig.savefig(name) + + # Log plot to mlflow run of every model visualized + if self.experiment and self.log_plots: + for m in set(BasePlot._fig.used_models): + MlflowClient().log_figure( + run_id=m._run.info.run_id, + figure=fig, + artifact_file=name if "." in name else f"{name}.png", + ) + + plt.show() if kwargs.get("display") else plt.close() + if kwargs.get("display") is None: + return fig + + @composed(contextmanager, crash) + def canvas( + self, + rows: INT = 1, + cols: INT = 2, + *, + horizontal_spacing: FLOAT = 0.05, + vertical_spacing: FLOAT = 0.07, + title: str | dict | None = None, + legend: str | dict | None = "out", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: BOOL = True, + ): + """Create a figure with multiple plots. + + This `@contextmanager` allows you to draw many plots in one + figure. The default option is to add two plots side by side. + See the [user guide][canvas] for an example. + + Parameters + ---------- + rows: int, default=1 + Number of plots in length. + + cols: int, default=2 + Number of plots in width. + + horizontal_spacing: float, default=0.05 + Space between subplot cols in normalized plot coordinates. + The spacing is relative to the figure's size. 
+ + vertical_spacing: float, default=0.07 + Space between subplot rows in normalized plot coordinates. + The spacing is relative to the figure's size. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="out" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of plots in the canvas. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool, default=True + Whether to render the plot. + + Yields + ------ + [go.Figure][] + Plot object. 
+ + """ + BasePlot._fig = BaseFigure( + rows=rows, + cols=cols, + horizontal_spacing=horizontal_spacing, + vertical_spacing=vertical_spacing, + palette=self.palette, + is_canvas=True, + ) + + try: + yield BasePlot._fig.figure + finally: + BasePlot._fig.is_canvas = False # Close the canvas + self._plot( + groupclick="togglegroup", + title=title, + legend=legend, + figsize=figsize or (550 + 350 * cols, 200 + 400 * rows), + plotname="canvas", + filename=filename, + display=display, + ) + + def reset_aesthetics(self): + """Reset the plot [aesthetics][] to their default values.""" + self._custom_layout = {} + self._custom_traces = {} + self._aesthetics = Aesthetics( + palette=PALETTE, + title_fontsize=24, + label_fontsize=16, + tick_fontsize=12, + line_width=2, + marker_size=8, + ) + + def update_layout(self, **kwargs): + """Update the properties of the plot's layout. + + Recursively update the structure of the original layout with + the values in the arguments. + + Parameters + ---------- + **kwargs + Keyword arguments for the figure's [update_layout][] method. + + """ + self._custom_layout = kwargs + + def update_traces(self, **kwargs): + """Update the properties of the plot's traces. + + Recursively update the structure of the original traces with + the values in the arguments. + + Parameters + ---------- + **kwargs + Keyword arguments for the figure's [update_traces][] method. + + """ + self._custom_traces = kwargs diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py new file mode 100644 index 000000000..105e7cd6d --- /dev/null +++ b/atom/plots/dataplot.py @@ -0,0 +1,985 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing the DataPlot class. 
+ +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from nltk.collocations import ( + BigramCollocationFinder, QuadgramCollocationFinder, + TrigramCollocationFinder, +) +from scipy import stats +from typeguard import typechecked + +from atom.plots.base import BasePlot +from atom.utils.constants import PALETTE +from atom.utils.types import INT, LEGEND, SEQUENCE, SERIES, SLICE +from atom.utils.utils import ( + check_dependency, crash, divide, get_corpus, lst, rnd, +) + + +@typechecked +class DataPlot(BasePlot): + """Data plots. + + Plots used for understanding and interpretation of the dataset. + They are only accessible from atom, since the other runners should + be used for model training only, not for data manipulation. + + """ + + @crash + def plot_correlation( + self, + columns: slice | SEQUENCE | None = None, + method: str = "pearson", + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (800, 700), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a correlation matrix. + + Displays a heatmap showing the correlation between columns in + the dataset. The colors red, blue and white stand for positive, + negative, and no correlation respectively. + + Parameters + ---------- + columns: slice, sequence or None, default=None + Columns to plot. If None, plot all columns in the dataset. + Selected categorical columns are ignored. + + method: str, default="pearson" + Method of correlation. Choose from: pearson, kendall or + spearman. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple, default=(800, 700) + Figure's size in pixels, format as (x, y). 
+ + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:DataPlot.plot_distribution + atom.plots:DataPlot.plot_qq + atom.plots:DataPlot.plot_relationships + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.plot_correlation() + ``` + + """ + columns = self.branch._get_columns(columns, only_numerical=True) + if method.lower() not in ("pearson", "kendall", "spearman"): + raise ValueError( + f"Invalid value for the method parameter, got {method}. " + "Choose from: pearson, kendall or spearman." + ) + + # Compute the correlation matrix + corr = self.dataset[columns].corr(method=method.lower()) + + # Generate a mask for the lower triangle + # k=1 means keep outermost diagonal line + mask = np.zeros_like(corr, dtype=bool) + mask[np.triu_indices_from(mask, k=1)] = True + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes( + x=(0, 0.87), + coloraxis=dict( + colorscale="rdbu_r", + cmin=-1, + cmax=1, + title=f"{method.lower()} correlation", + font_size=self.label_fontsize, + ), + ) + + fig.add_trace( + go.Heatmap( + z=corr.mask(mask), + x=columns, + y=columns, + coloraxis=f"coloraxis{xaxis[1:]}", + hovertemplate="x:%{x}
y:%{y}
z:%{z}", + hoverongaps=False, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + "template": "plotly_white", + f"yaxis{yaxis[1:]}_autorange": "reversed", + f"xaxis{xaxis[1:]}_showgrid": False, + f"yaxis{yaxis[1:]}_showgrid": False, + } + ) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_correlation", + filename=filename, + display=display, + ) + + @crash + def plot_distribution( + self, + columns: SLICE = 0, + distributions: str | SEQUENCE | None = None, + show: INT | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot column distributions. + + - For numerical columns, plot the probability density + distribution. Additionally, it's possible to plot any of + `scipy.stats` distributions fitted to the column. + - For categorical columns, plot the class distribution. + Only one categorical column can be plotted at the same time. + + !!! tip + Use atom's [distribution][atomclassifier-distribution] + method to check which distribution fits the column best. + + Parameters + ---------- + columns: int, str, slice or sequence, default=0 + Columns to plot. It's only possible to plot one categorical + column. If more than one categorical column is selected, + all categorical columns are ignored. + + distributions: str, sequence or None, default=None + Names of the `scipy.stats` distributions to fit to the + columns. If None, a [Gaussian kde distribution][kde] is + shown. Only for numerical columns. + + show: int or None, default=None + Number of classes (ordered by number of occurrences) to + show in the plot. If None, it shows all classes. Only for + categorical columns. + + title: str, dict or None, default=None + Title for the plot. 
+ + - If None: No title is shown. + - If str: Text for the title. + - If dict: [title configuration][parameters]. + + legend: str, dict or None, default="upper right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the plot's type. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:DataPlot.plot_correlation + atom.plots:DataPlot.plot_qq + atom.plots:DataPlot.plot_relationships + + Examples + -------- + ```pycon + import numpy as np + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + # Add a categorical feature + animals = ["cat", "dog", "bird", "lion", "zebra"] + probabilities = [0.001, 0.1, 0.2, 0.3, 0.399] + X["animals"] = np.random.choice(animals, size=len(X), p=probabilities) + + atom = ATOMClassifier(X, y, random_state=1) + atom.plot_distribution(columns=[0, 1]) + atom.plot_distribution(columns=0, distributions=["norm", "invgauss"]) + atom.plot_distribution(columns="animals") + ``` + + """ + columns = self.branch._get_columns(columns) + cat_columns = list(self.dataset.select_dtypes(exclude="number").columns) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + if len(columns) == 1 and columns[0] in 
cat_columns: + series = self.dataset[columns[0]].value_counts(ascending=True) + + if show is None or show > len(series): + show = len(series) + elif show < 1: + raise ValueError( + "Invalid value for the show parameter." + f"Value should be >0, got {show}." + ) + + color = BasePlot._fig.get_elem() + fig.add_trace( + go.Bar( + x=series, + y=series.index, + orientation="h", + marker=dict( + color=f"rgba({color[4:-1]}, 0.2)", + line=dict(width=2, color=color), + ), + hovertemplate="%{x}", + name=f"{columns[0]}: {len(series)} classes", + showlegend=BasePlot._fig.showlegend("dist", legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Counts", + ylim=(len(series) - show - 0.5, len(series) - 0.5), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_distribution", + filename=filename, + display=display, + ) + + else: + for col in [c for c in columns if c not in cat_columns]: + fig.add_trace( + go.Histogram( + x=self.dataset[col], + histnorm="probability density", + marker=dict( + color=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)", + line=dict(width=2, color=BasePlot._fig.get_elem(col)), + ), + nbinsx=40, + name="dist", + legendgroup=col, + legendgrouptitle=dict(text=col, font_size=self.label_fontsize), + showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + x = np.linspace(self.dataset[col].min(), self.dataset[col].max(), 200) + + # Drop missing values for compatibility with scipy.stats + missing = self.missing + [np.inf, -np.inf] + values = self.dataset[col].replace(missing, np.NaN).dropna() + + if distributions: + # Get a line for each distribution + for j, dist in enumerate(lst(distributions)): + params = getattr(stats, dist).fit(values) + + fig.add_trace( + self._draw_line( + x=x, + y=getattr(stats, dist).pdf(x, *params), + parent=col, + child=dist, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + 
) + else: + # If no distributions specified, draw Gaussian kde + fig.add_trace( + self._draw_line( + x=x, + y=stats.gaussian_kde(values)(x), + parent=col, + child="kde", + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout(dict(barmode="overlay")) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Values", + ylabel="Probability density", + title=title, + legend=legend, + figsize=figsize or (900, 600), + plotname="plot_distribution", + filename=filename, + display=display, + ) + + @crash + def plot_ngrams( + self, + ngram: INT | str = "bigram", + index: SLICE | None = None, + show: INT = 10, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot n-gram frequencies. + + The text for the plot is extracted from the column named + `corpus`. If there is no column with that name, an exception + is raised. If the documents are not tokenized, the words are + separated by spaces. + + !!! tip + Use atom's [tokenize][atomclassifier-tokenize] method to + separate the words creating n-grams based on their frequency + in the corpus. + + Parameters + ---------- + ngram: str or int, default="bigram" + Number of contiguous words to search for (size of n-gram). + Choose from: words (1), bigrams (2), trigrams (3), + quadgrams (4). + + index: int, str, slice, sequence or None, default=None + Documents in the corpus to include in the search. If None, + it selects all documents in the dataset. + + show: int, default=10 + Number of n-grams (ordered by number of occurrences) to + show in the plot. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. 
See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of n-grams shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:DataPlot.plot_wordcloud + + Examples + -------- + ```pycon + import numpy as np + from atom import ATOMClassifier + from sklearn.datasets import fetch_20newsgroups + + X, y = fetch_20newsgroups( + return_X_y=True, + categories=["alt.atheism", "sci.med", "comp.windows.x"], + shuffle=True, + random_state=1, + ) + X = np.array(X).reshape(-1, 1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.textclean() + atom.textnormalize() + atom.plot_ngrams() + ``` + + """ + + def get_text(column: SERIES) -> SERIES: + """Get the complete corpus as sequence of tokens. + + Parameters + ---------- + column: series + Column containing the corpus. + + Returns + ------- + series + Corpus of tokens. 
+ + """ + if isinstance(column.iat[0], str): + return column.apply(lambda row: row.split()) + else: + return column + + corpus = get_corpus(self.X) + rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)] + + if str(ngram).lower() in ("1", "word", "words"): + ngram = "words" + series = pd.Series( + [word for row in get_text(rows[corpus]) for word in row] + ).value_counts(ascending=True) + else: + if str(ngram).lower() in ("2", "bigram", "bigrams"): + ngram, finder = "bigrams", BigramCollocationFinder + elif str(ngram).lower() in ("3", "trigram", "trigrams"): + ngram, finder = "trigrams", TrigramCollocationFinder + elif str(ngram).lower() in ("4", "quadgram", "quadgrams"): + ngram, finder = "quadgrams", QuadgramCollocationFinder + else: + raise ValueError( + f"Invalid value for the ngram parameter, got {ngram}. " + "Choose from: words, bigram, trigram, quadgram." + ) + + ngram_fd = finder.from_documents(get_text(rows[corpus])).ngram_fd + series = pd.Series( + data=[x[1] for x in ngram_fd.items()], + index=[" ".join(x[0]) for x in ngram_fd.items()], + ).sort_values(ascending=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + fig.add_trace( + go.Bar( + x=(data := series[-show:]), + y=data.index, + orientation="h", + marker=dict( + color=f"rgba({BasePlot._fig.get_elem(ngram)[4:-1]}, 0.2)", + line=dict(width=2, color=BasePlot._fig.get_elem(ngram)), + ), + hovertemplate="%{x}", + name=f"Total {ngram}: {len(series)}", + legendgroup=ngram, + showlegend=BasePlot._fig.showlegend(ngram, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Counts", + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_ngrams", + filename=filename, + display=display, + ) + + @crash + def plot_qq( + self, + columns: SLICE = 0, + distributions: str | SEQUENCE = "norm", + *, + title: str | dict | None = None, + legend: str | dict | None 
= "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a quantile-quantile plot. + + Columns are distinguished by color and the distributions are + distinguished by marker type. Missing values are ignored. + + Parameters + ---------- + columns: int, str, slice or sequence, default=0 + Columns to plot. Selected categorical columns are ignored. + + distributions: str or sequence, default="norm" + Names of the `scipy.stats` distributions to fit to the + columns. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:DataPlot.plot_correlation + atom.plots:DataPlot.plot_distribution + atom.plots:DataPlot.plot_relationships + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.plot_qq(columns=[5, 6]) + atom.plot_qq(columns=0, distributions=["norm", "invgauss", "triang"]) + ``` + + """ + columns = self.branch._get_columns(columns) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + percentiles = np.linspace(0, 100, 101) + for col in columns: + # Drop missing values for compatibility with scipy.stats + missing = self.missing + [np.inf, -np.inf] + values = self.dataset[col].replace(missing, np.nan).dropna() + + for dist in lst(distributions): + stat = getattr(stats, dist) + params = stat.fit(values) + samples = stat.rvs(*params, size=101, random_state=self.random_state) + + fig.add_trace( + self._draw_line( + x=np.percentile(samples, percentiles), + y=np.percentile(values, percentiles), + mode="markers", + parent=col, + child=dist, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Theoretical quantiles", + ylabel="Observed quantiles", + title=title, + legend=legend, + figsize=figsize or (900, 600), + plotname="plot_qq", + filename=filename, + display=display, + ) + + @crash + def plot_relationships( + self, + columns: slice | SEQUENCE = (0, 1, 2), + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (900, 900), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot pairwise relationships in a dataset. 
+ + Creates a grid of axes such that each numerical column appears + once on the x-axes and once on the y-axes. The bottom triangle + contains scatter plots (max 250 random samples), the diagonal + plots contain column distributions, and the upper triangle + contains contour histograms for all samples in the columns. + + Parameters + ---------- + columns: slice or sequence, default=(0, 1, 2) + Columns to plot. Selected categorical columns are ignored. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple, default=(900, 900) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:DataPlot.plot_correlation + atom.plots:DataPlot.plot_distribution + atom.plots:DataPlot.plot_qq + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.plot_relationships(columns=[0, 4, 5]) + ``` + + """ + columns = self.branch._get_columns(columns, only_numerical=True) + + # Use max 250 samples to not clutter the plot + sample = lambda col: self.dataset[col].sample( + n=min(len(self.dataset), 250), random_state=self.random_state + ) + + fig = self._get_figure() + color = BasePlot._fig.get_elem() + for i in range(len(columns)**2): + x, y = i // len(columns), i % len(columns) + + # Calculate the distance between subplots + offset = divide(0.0125, (len(columns) - 1)) + + # Calculate the size of the subplot + size = (1 - ((offset * 2) * (len(columns) - 1))) / len(columns) + + # Determine the position for the axes + x_pos = y * (size + 2 * offset) + y_pos = (len(columns) - x - 1) * (size + 2 * offset) + + xaxis, yaxis = BasePlot._fig.get_axes( + x=(x_pos, rnd(x_pos + size)), + y=(y_pos, rnd(y_pos + size)), + coloraxis=dict( + colorscale=PALETTE.get(color, "Blues"), + cmin=0, + cmax=len(self.dataset), + showscale=False, + ) + ) + + if x == y: + fig.add_trace( + go.Histogram( + x=self.dataset[columns[x]], + marker=dict( + color=f"rgba({color[4:-1]}, 0.2)", + line=dict(width=2, color=color), + ), + name=columns[x], + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + elif x > y: + fig.add_trace( + go.Scatter( + x=sample(columns[y]), + y=sample(columns[x]), + mode="markers", + marker=dict(color=color), + hovertemplate="(%{x}, %{y})", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + elif y > x: + fig.add_trace( + go.Histogram2dContour( + x=self.dataset[columns[y]], + y=self.dataset[columns[x]], + coloraxis=f"coloraxis{xaxis[1:]}", + 
hovertemplate="x:%{x}
y:%{y}
z:%{z}", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + if x < len(columns) - 1: + fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) + if y > 0: + fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) + + self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel=columns[y] if x == len(columns) - 1 else None, + ylabel=columns[x] if y == 0 else None, + ) + + return self._plot( + title=title, + legend=legend, + figsize=figsize or (900, 900), + plotname="plot_relationships", + filename=filename, + display=display, + ) + + @crash + def plot_wordcloud( + self, + index: SLICE | None = None, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + **kwargs, + ) -> go.Figure | None: + """Plot a wordcloud from the corpus. + + The text for the plot is extracted from the column named + `corpus`. If there is no column with that name, an exception + is raised. + + Parameters + ---------- + index: int, str, slice, sequence or None, default=None + Documents in the corpus to include in the wordcloud. If + None, it selects all documents in the dataset. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. 
+ + **kwargs + Additional keyword arguments for the [Wordcloud][] object. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:DataPlot.plot_ngrams + atom.plots:PredictionPlot.plot_pipeline + + Examples + -------- + ```pycon + import numpy as np + from atom import ATOMClassifier + from sklearn.datasets import fetch_20newsgroups + + X, y = fetch_20newsgroups( + return_X_y=True, + categories=["alt.atheism", "sci.med", "comp.windows.x"], + shuffle=True, + random_state=1, + ) + X = np.array(X).reshape(-1, 1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.textclean() + atom.textnormalize() + atom.plot_wordcloud() + ``` + + """ + + def get_text(column): + """Get the complete corpus as one long string.""" + if isinstance(column.iat[0], str): + return " ".join(column) + else: + return " ".join([" ".join(row) for row in column]) + + check_dependency("wordcloud") + from wordcloud import WordCloud + + corpus = get_corpus(self.X) + rows = self.dataset.loc[self.branch._get_rows(index, return_test=False)] + + wordcloud = WordCloud( + width=figsize[0], + height=figsize[1], + background_color=kwargs.pop("background_color", "white"), + random_state=kwargs.pop("random_state", self.random_state), + **kwargs, + ) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + fig.add_trace( + go.Image( + z=wordcloud.generate(get_text(rows[corpus])), + hoverinfo="skip", + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + f"xaxis{xaxis[1:]}_showticklabels": False, + f"yaxis{yaxis[1:]}_showticklabels": False, + } + ) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + title=title, + legend=legend, + figsize=figsize or (900, 600), + plotname="plot_wordcloud", + filename=filename, + display=display, + ) diff --git a/atom/plots/featureselectionplot.py b/atom/plots/featureselectionplot.py new file mode 100644 index 000000000..79f83e1f3 --- /dev/null +++ 
b/atom/plots/featureselectionplot.py @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing the FeatureSelectionPlot class. + +""" + +from __future__ import annotations + +import numpy as np +import plotly.graph_objects as go +from sklearn.utils.metaestimators import available_if +from typeguard import typechecked + +from atom.plots.base import BasePlot +from atom.utils.types import INT, LEGEND +from atom.utils.utils import crash, has_attr + + +@typechecked +class FeatureSelectionPlot(BasePlot): + """Feature selection plots. + + These plots are accessible from atom or from the FeatureSelector + class when the appropriate feature selection strategy is used. + + """ + + @available_if(has_attr("pca")) + @crash + def plot_components( + self, + show: INT | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the explained variance ratio per component. + + Kept components are colored and discarded components are + transparent. This plot is available only when feature selection + was applied with strategy="pca". + + Parameters + ---------- + show: int or None, default=None + Number of components to show. None to show all. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). 
If None, it + adapts the size to the number of components shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:FeatureSelectionPlot.plot_pca + atom.plots:FeatureSelectionPlot.plot_rfecv + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.feature_selection("pca", n_features=5) + atom.plot_components(show=10) + ``` + + """ + if show is None or show > self.pca.components_.shape[0]: + # Limit max features shown to avoid maximum figsize error + show = min(200, self.pca.components_.shape[0]) + elif show < 1: + raise ValueError( + "Invalid value for the show parameter. " + f"Value should be >0, got {show}." 
+ ) + + # Get the variance ratio per component + variance = np.array(self.pca.explained_variance_ratio_) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + # Create color scheme: first normal and then fully transparent + color = BasePlot._fig.get_elem("components") + opacity = [0.2] * self.pca._comps + [0] * (len(variance) - self.pca._comps) + + fig.add_trace( + go.Bar( + x=variance, + y=[f"pca{str(i)}" for i in range(len(variance))], + orientation="h", + marker=dict( + color=[f"rgba({color[4:-1]}, {o})" for o in opacity], + line=dict(width=2, color=color), + ), + hovertemplate="%{x}", + name=f"Variance retained: {variance[:self.pca._comps].sum():.3f}", + legendgroup="components", + showlegend=BasePlot._fig.showlegend("components", legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout({f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending")}) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Explained variance ratio", + ylim=(len(variance) - show - 0.5, len(variance) - 0.5), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_components", + filename=filename, + display=display, + ) + + @available_if(has_attr("pca")) + @crash + def plot_pca( + self, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the explained variance ratio vs number of components. + + If the underlying estimator is [PCA][] (for dense datasets), + all possible components are plotted. If the underlying estimator + is [TruncatedSVD][] (for sparse datasets), it only shows the + selected components. The star marks the number of components + selected by the user. This plot is available only when feature + selection was applied with strategy="pca". 
+ + Parameters + ---------- + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:FeatureSelectionPlot.plot_components + atom.plots:FeatureSelectionPlot.plot_rfecv + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.feature_selection("pca", n_features=5) + atom.plot_pca() + ``` + + """ + # Create star symbol at selected number of components + symbols = ["circle"] * self.pca.n_features_in_ + symbols[self.pca._comps - 1] = "star" + sizes = [self.marker_size] * self.pca.n_features_in_ + sizes[self.pca._comps - 1] = self.marker_size * 1.5 + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + fig.add_trace( + go.Scatter( + x=tuple(range(1, self.pca.n_features_in_ + 1)), + y=np.cumsum(self.pca.explained_variance_ratio_), + mode="lines+markers", + line=dict(width=self.line_width, color=BasePlot._fig.get_elem("pca")), + marker=dict( + symbol=symbols, + size=sizes, + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + opacity=1, + ), + hovertemplate="%{y}", + 
showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + "hovermode": "x", + f"xaxis{xaxis[1:]}_showspikes": True, + f"yaxis{yaxis[1:]}_showspikes": True, + } + ) + + margin = self.pca.n_features_in_ / 30 + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="First N principal components", + ylabel="Cumulative variance ratio", + xlim=(1 - margin, self.pca.n_features_in_ - 1 + margin), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_pca", + filename=filename, + display=display, + ) + + @available_if(has_attr("rfecv")) + @crash + def plot_rfecv( + self, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the rfecv results. + + Plot the scores obtained by the estimator fitted on every + subset of the dataset. Only available when feature selection + was applied with strategy="rfecv". + + Parameters + ---------- + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. 
+ + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:FeatureSelectionPlot.plot_components + atom.plots:FeatureSelectionPlot.plot_pca + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.feature_selection("rfecv", solver="Tree") + atom.plot_rfecv() + ``` + + """ + try: # Define the y-label for the plot + ylabel = self.rfecv.get_params()["scoring"].name + except AttributeError: + ylabel = "accuracy" if self.goal.startswith("class") else "r2" + + x = range(self.rfecv.min_features_to_select, self.rfecv.n_features_in_ + 1) + + # Create star symbol at selected number of features + sizes = [6] * len(x) + sizes[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = 12 + symbols = ["circle"] * len(x) + symbols[self.rfecv.n_features_ - self.rfecv.min_features_to_select] = "star" + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + mean = self.rfecv.cv_results_["mean_test_score"] + std = self.rfecv.cv_results_["std_test_score"] + + fig.add_trace( + go.Scatter( + x=list(x), + y=mean, + mode="lines+markers", + line=dict(width=self.line_width, color=BasePlot._fig.get_elem("rfecv")), + marker=dict( + symbol=symbols, + size=sizes, + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + opacity=1, + ), + name=ylabel, + legendgroup="rfecv", + showlegend=BasePlot._fig.showlegend("rfecv", legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + # Add error bands + fig.add_traces( + [ + go.Scatter( + x=tuple(x), + y=mean + std, + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem("rfecv")), + hovertemplate="%{y}upper bound", + legendgroup="rfecv", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + go.Scatter( + x=tuple(x), + y=mean - std, + mode="lines", + line=dict(width=1, 
color=BasePlot._fig.get_elem("rfecv")), + fill="tonexty", + fillcolor=f"rgba{BasePlot._fig.get_elem('rfecv')[3:-1]}, 0.2)", + hovertemplate="%{y}lower bound", + legendgroup="rfecv", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + ] + ) + + fig.update_layout({"hovermode": "x unified"}) + + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + xlabel="Number of features", + ylabel=ylabel, + xlim=(min(x) - len(x) / 30, max(x) + len(x) / 30), + ylim=(min(mean) - 3 * max(std), max(mean) + 3 * max(std)), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_rfecv", + filename=filename, + display=display, + ) diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py new file mode 100644 index 000000000..08f09893a --- /dev/null +++ b/atom/plots/hyperparametertuningplot.py @@ -0,0 +1,1453 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing the HyperparameterTuningPlot class. + +""" + +from __future__ import annotations + +from datetime import datetime + +import numpy as np +import plotly.graph_objects as go +from optuna.importance import FanovaImportanceEvaluator +from optuna.trial import TrialState +from optuna.visualization._parallel_coordinate import ( + _get_dims_from_info, _get_parallel_coordinate_info, +) +from optuna.visualization._terminator_improvement import _get_improvement_info +from optuna.visualization._utils import _is_log_scale +from sklearn.utils._bunch import Bunch +from typeguard import typechecked + +from atom.plots.base import BasePlot +from atom.utils.constants import PALETTE +from atom.utils.types import INT, INT_TYPES, LEGEND, MODEL, SEQUENCE +from atom.utils.utils import ( + check_dependency, check_hyperparams, composed, crash, divide, it, lst, + plot_from_model, rnd, +) + + +@typechecked +class HyperparameterTuningPlot(BasePlot): + """Hyperparameter tuning plots. 
+ + Plots that help interpret the model's study and corresponding + trials. These plots are accessible from the runners or from the + models. If called from a runner, the `models` parameter has to be + specified (if None, uses all models). If called from a model, that + model is used and the `models` parameter becomes unavailable. + + """ + + @composed(crash, plot_from_model) + def plot_edf( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: INT | str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper left", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the Empirical Distribution Function of a study. + + Use this plot to analyze and improve hyperparameter search + spaces. The EDF assumes that the value of the objective + function is in accordance with the uniform distribution over + the objective space. This plot is only available for models + that ran [hyperparameter tuning][]. + + !!! note + Only complete trials are considered when plotting the EDF. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models that used hyperparameter + tuning are selected. + + metric: int, str, sequence or None, default=None + Metric to plot (only for multi-metric runs). If str, add `+` + between options to select more than one. If None, the metric + used to run the pipeline is selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. 
+ + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_hyperparameters + atom.plots:HyperparameterTuningPlot.plot_trials + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from optuna.distributions import IntDistribution + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + + # Run three models with different search spaces + atom.run( + models="RF_1", + n_trials=10, + ht_params={"distributions": {"n_estimators": IntDistribution(6, 10)}}, + ) + atom.run( + models="RF_2", + n_trials=10, + ht_params={"distributions": {"n_estimators": IntDistribution(11, 15)}}, + ) + atom.run( + models="RF_3", + n_trials=10, + ht_params={"distributions": {"n_estimators": IntDistribution(16, 20)}}, + ) + + atom.plot_edf() + ``` + + """ + models = check_hyperparams(models, "plot_edf") + metric = self._get_metric(metric, max_one=False) + + values = [] + for m in models: + values.append([]) + for met in metric: + values[-1].append(np.array([lst(row)[met] for row in m.trials["score"]])) + + x_min = np.nanmin(np.array(values)) + x_max = np.nanmax(np.array(values)) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m, val in zip(models, values): + for met, scores in zip(metric, val): + fig.add_trace( + self._draw_line( + x=(x := np.linspace(x_min, x_max, 100)), + 
y=np.sum(scores[:, np.newaxis] <= x, axis=0) / len(scores), + parent=m.name, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + ylim=(0, 1), + xlabel="Score", + ylabel="Cumulative Probability", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_edf", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_hyperparameter_importance( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: int | str = 0, + show: INT | None = None, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a model's hyperparameter importance. + + The hyperparameter importance are calculated using the + [fANOVA][] importance evaluator. The sum of importances for all + parameters (per model) is 1. This plot is only available for + models that ran [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models that used hyperparameter + tuning are selected. + + metric: int or str, default=0 + Metric to plot (only for multi-metric runs). + + show: int or None, default=None + Number of hyperparameters (ordered by importance) to show. + None to show all. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. 
+ + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of hyperparameters shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_feature_importance + atom.plots:HyperparameterTuningPlot.plot_hyperparameters + atom.plots:HyperparameterTuningPlot.plot_trials + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["ET", "RF"], n_trials=10) + atom.plot_hyperparameter_importance() + ``` + + """ + models = check_hyperparams(models, "plot_hyperparameter_importance") + params = len(set([k for m in lst(models) for k in m._ht["distributions"]])) + met = self._get_metric(metric, max_one=True) + + if show is None or show > params: + # Limit max features shown to avoid maximum figsize error + show = min(200, params) + elif show < 1: + raise ValueError( + f"Invalid value for the show parameter. Value should be >0, got {show}." 
+ ) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + importances = FanovaImportanceEvaluator(seed=self.random_state).evaluate( + study=m.study, + target=None if len(self._metric) == 1 else lambda x: x.values[met], + ) + + fig.add_trace( + go.Bar( + x=np.array(list(importances.values())) / sum(importances.values()), + y=list(importances.keys()), + orientation="h", + marker=dict( + color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + ), + hovertemplate="%{x}", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), + "bargroupgap": 0.05, + } + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Normalized hyperparameter importance", + ylim=(params - show - 0.5, params - 0.5), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_hyperparameter_importance", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_hyperparameters( + self, + models: INT | str | MODEL | None = None, + params: str | slice | SEQUENCE = (0, 1), + metric: int | str = 0, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot hyperparameter relationships in a study. + + A model's hyperparameters are plotted against each other. The + corresponding metric scores are displayed in a contour plot. + The markers are the trials in the study. This plot is only + available for models that ran [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. 
If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_hyperparameters()`. + + params: str, slice or sequence, default=(0, 1) + Hyperparameters to plot. Use a sequence or add `+` between + options to select more than one. + + metric: int or str, default=0 + Metric to plot (only for multi-metric runs). + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of hyperparameters shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_hyperparameter_importance + atom.plots:HyperparameterTuningPlot.plot_parallel_coordinate + atom.plots:HyperparameterTuningPlot.plot_trials + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR", n_trials=15) + atom.plot_hyperparameters(params=(0, 1, 2)) + ``` + + """ + m = check_hyperparams(models, "plot_hyperparameters")[0] + + if len(params := self._get_hyperparams(params, models)) < 2: + raise ValueError( + "Invalid value for the hyperparameters parameter. A minimum " + f"of two parameters is required, got {len(params)}." + ) + + met = self._get_metric(metric, max_one=True) + + fig = self._get_figure() + for i in range((length := len(params) - 1) ** 2): + x, y = i // length, i % length + + if y <= x: + # Calculate the size of the subplot + size = 1 / length + + # Determine the position for the axes + x_pos = y * size + y_pos = (length - x - 1) * size + + xaxis, yaxis = BasePlot._fig.get_axes( + x=(x_pos, rnd(x_pos + size)), + y=(y_pos, rnd(y_pos + size)), + coloraxis=dict( + axes="99", + colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"), + cmin=np.nanmin( + m.trials.apply(lambda x: lst(x["score"])[met], axis=1) + ), + cmax=np.nanmax( + m.trials.apply(lambda x: lst(x["score"])[met], axis=1) + ), + showscale=False, + ) + ) + + x_values = lambda row: row["params"].get(params[y], None) + y_values = lambda row: row["params"].get(params[x + 1], None) + + fig.add_trace( + go.Scatter( + x=m.trials.apply(x_values, axis=1), + y=m.trials.apply(y_values, axis=1), + mode="markers", + marker=dict( + size=self.marker_size, + color=BasePlot._fig.get_elem(m.name), + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + ), + customdata=list( + zip( + m.trials.index.tolist(), + m.trials.apply(lambda x: 
lst(x["score"])[met], axis=1), + ) + ), + hovertemplate=( + f"{params[y]}:%{{x}}
" + f"{params[x + 1]}:%{{y}}
" + f"{self._metric[met].name}:%{{customdata[1]:.4f}}" + "Trial %{customdata[0]}" + ), + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.add_trace( + go.Contour( + x=m.trials.apply(x_values, axis=1), + y=m.trials.apply(y_values, axis=1), + z=m.trials.apply(lambda i: lst(i["score"])[met], axis=1), + contours=dict( + showlabels=True, + labelfont=dict(size=self.tick_fontsize, color="white") + ), + coloraxis="coloraxis99", + hoverinfo="skip", + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + if _is_log_scale(m.study.trials, params[y]): + fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"}) + if _is_log_scale(m.study.trials, params[x + 1]): + fig.update_layout({f"yaxis{xaxis[1:]}_type": "log"}) + + if x < length - 1: + fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) + if y > 0: + fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) + + fig.update_layout( + { + "template": "plotly_white", + f"xaxis{xaxis[1:]}_showgrid": False, + f"yaxis{yaxis[1:]}_showgrid": False, + f"xaxis{yaxis[1:]}_zeroline": False, + f"yaxis{yaxis[1:]}_zeroline": False, + } + ) + + self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel=params[y] if x == length - 1 else None, + ylabel=params[x + 1] if y == 0 else None, + ) + + BasePlot._fig.used_models.append(m) + return self._plot( + title=title, + legend=legend, + figsize=figsize or (800 + 100 * length, 500 + 100 * length), + plotname="plot_hyperparameters", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_parallel_coordinate( + self, + models: INT | str | MODEL | None = None, + params: str | slice | SEQUENCE | None = None, + metric: INT | str = 0, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot high-dimensional parameter relationships in a study. 
+ + Every line of the plot represents one trial. This plot is only + available for models that ran [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_parallel_coordinate()`. + + params: str, slice, sequence or None, default=None + Hyperparameters to plot. Use a sequence or add `+` between + options to select more than one. If None, all the model's + hyperparameters are selected. + + metric: int or str, default=0 + Metric to plot (only for multi-metric runs). + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of hyperparameters shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_edf + atom.plots:HyperparameterTuningPlot.plot_hyperparameter_importance + atom.plots:HyperparameterTuningPlot.plot_hyperparameters + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("RF", n_trials=15) + atom.plot_parallel_coordinate(params=slice(1, 5)) + ``` + + """ + + def sort_mixed_types(values: list[str]) -> list[str]: + """Sort a sequence of numbers and strings. + + Numbers are converted and take precedence over strings. + + Parameters + ---------- + values: list + Values to sort. + + Returns + ------- + list of str + Sorted values. + + """ + numbers, categorical = [], [] + for elem in values: + try: + numbers.append(it(float(elem))) + except (TypeError, ValueError): + categorical.append(str(elem)) + + return list(map(str, sorted(numbers))) + sorted(categorical) + + m = check_hyperparams(models, "plot_parallel_coordinate")[0] + params = self._get_hyperparams(params, models) + met = self._get_metric(metric, max_one=True) + + dims = _get_dims_from_info( + _get_parallel_coordinate_info( + study=m.study, + params=params, + target=None if len(self._metric) == 1 else lambda x: x.values[met], + target_name=self._metric[met].name, + ) + ) + + # Clean up dimensions for nicer view + for d in [dims[0]] + sorted(dims[1:], key=lambda x: params.index(x["label"])): + if "ticktext" in d: + # Skip processing for logarithmic params + if all(isinstance(i, INT_TYPES) for i in d["values"]): + # Order categorical values + mapping = [d["ticktext"][i] for i in d["values"]] + d["ticktext"] = sort_mixed_types(d["ticktext"]) + d["values"] = [d["ticktext"].index(v) for v in mapping] + else: + # Round numerical values + d["tickvals"] = list( + map(rnd, np.linspace(min(d["values"]), max(d["values"]), 5)) + ) + + fig = self._get_figure() 
+ xaxis, yaxis = BasePlot._fig.get_axes( + coloraxis=dict( + colorscale=PALETTE.get(BasePlot._fig.get_elem(m.name), "Blues"), + cmin=min(dims[0]["values"]), + cmax=max(dims[0]["values"]), + title=self._metric[met].name, + font_size=self.label_fontsize, + ) + ) + + fig.add_trace( + go.Parcoords( + dimensions=dims, + line=dict( + color=dims[0]["values"], + coloraxis=f"coloraxis{xaxis[1:]}", + ), + unselected=dict(line=dict(color="gray", opacity=0.5)), + labelside="bottom", + labelfont=dict(size=self.label_fontsize), + ) + ) + + BasePlot._fig.used_models.append(m) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + title=title, + legend=legend, + figsize=figsize or (700 + len(params) * 50, 600), + plotname="plot_parallel_coordinate", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_pareto_front( + self, + models: INT | str | MODEL | None = None, + metric: str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the Pareto front of a study. + + Shows the trial scores plotted against each other. The marker's + colors indicate the trial number. This plot is only available + for models that ran [multi-metric runs][] with + [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_pareto_front()`. + + metric: str, sequence or None, default=None + Metrics to plot. Use a sequence or add `+` between options + to select more than one. If None, the metrics used to run + the pipeline are selected. 
+ + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of metrics shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_edf + atom.plots:HyperparameterTuningPlot.plot_slice + atom.plots:HyperparameterTuningPlot.plot_trials + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run( + models="RF", + metric=["f1", "accuracy", "recall"], + n_trials=15, + ) + atom.plot_pareto_front() + ``` + + """ + m = check_hyperparams(models, "plot_pareto_front")[0] + + if len(metric := self._get_metric(metric, max_one=False)) < 2: + raise ValueError( + "Invalid value for the metric parameter. A minimum " + f"of two metrics are required, got {len(metric)}." 
+ ) + + fig = self._get_figure() + for i in range((length := len(metric) - 1) ** 2): + x, y = i // length, i % length + + if y <= x: + # Calculate the distance between subplots + offset = divide(0.0125, length - 1) + + # Calculate the size of the subplot + size = (1 - ((offset * 2) * (length - 1))) / length + + # Determine the position for the axes + x_pos = y * (size + 2 * offset) + y_pos = (length - x - 1) * (size + 2 * offset) + + xaxis, yaxis = BasePlot._fig.get_axes( + x=(x_pos, rnd(x_pos + size)), + y=(y_pos, rnd(y_pos + size)), + ) + + fig.add_trace( + go.Scatter( + x=m.trials.apply(lambda row: row["score"][y], axis=1), + y=m.trials.apply(lambda row: row["score"][x + 1], axis=1), + mode="markers", + marker=dict( + size=self.marker_size, + color=m.trials.index, + colorscale="Teal", + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + ), + customdata=m.trials.index, + hovertemplate="(%{x}, %{y})Trial %{customdata}", + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + if x < len(metric) - 1: + fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) + if y > 0: + fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) + + self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel=self._metric[y].name if x == length - 1 else None, + ylabel=self._metric[x + 1].name if y == 0 else None, + ) + + BasePlot._fig.used_models.append(m) + return self._plot( + title=title, + legend=legend, + figsize=figsize or (500 + 100 * length, 500 + 100 * length), + plotname="plot_pareto_front", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_slice( + self, + models: INT | str | MODEL | None = None, + params: str | slice | SEQUENCE | None = None, + metric: INT | str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot 
the parameter relationship in a study. + + The color of the markers indicates the trial. This plot is only + available for models that ran [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_slice()`. + + params: str, slice, sequence or None, default=None + Hyperparameters to plot. Use a sequence or add `+` between + options to select more than one. If None, all the model's + hyperparameters are selected. + + metric: int, str, sequence or None, default=None + Metric to plot (only for multi-metric runs). If str, add `+` + between options to select more than one. If None, the metric + used to run the pipeline is selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of hyperparameters shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_edf + atom.plots:HyperparameterTuningPlot.plot_hyperparameters + atom.plots:HyperparameterTuningPlot.plot_parallel_coordinate + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run( + models="RF", + metric=["f1", "recall"], + n_trials=15, + ) + atom.plot_slice(params=(0, 1, 2)) + ``` + + """ + m = check_hyperparams(models, "plot_slice")[0] + params = self._get_hyperparams(params, models) + metric = self._get_metric(metric, max_one=False) + + fig = self._get_figure() + for i in range(len(params) * len(metric)): + x, y = i // len(params), i % len(params) + + # Calculate the distance between subplots + x_offset = divide(0.0125, (len(params) - 1)) + y_offset = divide(0.0125, (len(metric) - 1)) + + # Calculate the size of the subplot + x_size = (1 - ((x_offset * 2) * (len(params) - 1))) / len(params) + y_size = (1 - ((y_offset * 2) * (len(metric) - 1))) / len(metric) + + # Determine the position for the axes + x_pos = y * (x_size + 2 * x_offset) + y_pos = (len(metric) - x - 1) * (y_size + 2 * y_offset) + + xaxis, yaxis = BasePlot._fig.get_axes( + x=(x_pos, rnd(x_pos + x_size)), + y=(y_pos, rnd(y_pos + y_size)), + ) + + fig.add_trace( + go.Scatter( + x=m.trials.apply(lambda r: r["params"].get(params[y], None), axis=1), + y=m.trials.apply(lambda r: lst(r["score"])[x], axis=1), + mode="markers", + marker=dict( + size=self.marker_size, + color=m.trials.index, + colorscale="Teal", + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + ), + customdata=m.trials.index, + hovertemplate="(%{x}, %{y})Trial %{customdata}", + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + if _is_log_scale(m.study.trials, params[y]): + fig.update_layout({f"xaxis{xaxis[1:]}_type": "log"}) + + if x < len(metric) - 1: + 
fig.update_layout({f"xaxis{xaxis[1:]}_showticklabels": False}) + if y > 0: + fig.update_layout({f"yaxis{yaxis[1:]}_showticklabels": False}) + + self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel=params[y] if x == len(metric) - 1 else None, + ylabel=self._metric[x].name if y == 0 else None, + ) + + BasePlot._fig.used_models.append(m) + return self._plot( + title=title, + legend=legend, + figsize=figsize or (800 + 100 * len(params), 500 + 100 * len(metric)), + plotname="plot_slice", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_terminator_improvement( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the potentials for future objective improvement. + + This function visualizes the objective improvement potentials. + It helps to determine whether you should continue the + optimization or not. The evaluated error is also plotted. Note + that this function may take some time to compute the improvement + potentials. This plot is only available for models that ran + [hyperparameter tuning][]. + + !!! warning + * The plot_terminator_improvement method is only available + for models that ran [hyperparameter tuning][] using + cross-validation, e.g. using `ht_params={'cv': 5}`. + * This method can be slow. Results are cached to fasten + repeated calls. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models that used hyperparameter + tuning are selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. 
+ + legend: str, dict or None, default="upper right", + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y) + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_pareto_front + atom.plots:HyperparameterTuningPlot.plot_timeline + atom.plots:HyperparameterTuningPlot.plot_trials + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("RF", n_trials=10, ht_params={"cv": 5}) + atom.plot_terminator_improvement() + ``` + + """ + check_dependency("botorch") + + models = check_hyperparams(models, "plot_terminator_improvement") + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + if m._ht["cv"] > 1: + info = self._memory.cache(_get_improvement_info)(m.study, get_error=True) + else: + raise ValueError( + "The plot_terminator_improvement method is only available for " + "models that ran hyperparameter tuning using cross-validation, " + "e.g. using ht_params={'cv': 5}." 
+ ) + + fig.add_trace( + self._draw_line( + x=m.trials.index, + y=info.improvements, + error_y=dict(type="data", array=info.errors), + mode="markers+lines", + parent=m.name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Trial", + ylabel="Terminator improvement", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_terminator_improvement", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_timeline( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the timeline of a study. + + This plot is only available for models that ran + [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models that used hyperparameter + tuning are selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right", + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y) + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. 
+ + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_edf + atom.plots:HyperparameterTuningPlot.plot_slice + atom.plots:HyperparameterTuningPlot.plot_terminator_improvement + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from optuna.pruners import PatientPruner + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run( + models="LGB", + n_trials=15, + ht_params={"pruner": PatientPruner(None, patience=2)}, + ) + atom.plot_timeline() + ``` + + """ + models = check_hyperparams(models, "plot_timeline") + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + _cm = { + "COMPLETE": BasePlot._fig._palette[0], # Main color + "FAIL": "rgb(255, 0, 0)", # Red + "PRUNED": "rgb(255, 165, 0)", # Orange + "RUNNING": "rgb(124, 252, 0)", # Green + "WAITING": "rgb(220, 220, 220)", # Gray + } + + for m in models: + info = [] + for trial in m.study.get_trials(deepcopy=False): + date_complete = trial.datetime_complete or datetime.now() + date_start = trial.datetime_start or date_complete + + # Create nice representation of scores and params for hover + s = [f'{m}: {trial.values[i]}' for i, m in enumerate(self._metric.keys())] + p = [f" --> {k}: {v}" for k, v in trial.params.items()] + + info.append( + Bunch( + number=trial.number, + start=date_start, + duration=1000 * (date_complete - date_start).total_seconds(), + state=trial.state, + hovertext=( + f"Trial: {trial.number}
" + f"{'
'.join(s)}" + f"Parameters:
{'
'.join(p)}" + ) + ) + ) + + for state in sorted(TrialState, key=lambda x: x.name): + if bars := list(filter(lambda x: x.state == state, info)): + fig.add_trace( + go.Bar( + name=state.name, + x=[b.duration for b in bars], + y=[b.number for b in bars], + base=[b.start.isoformat() for b in bars], + text=[b.hovertext for b in bars], + textposition="none", + hovertemplate=f"%{{text}}{m.name}", + orientation="h", + marker=dict( + color=f"rgba({_cm[state.name][4:-1]}, 0.2)", + line=dict(width=2, color=_cm[state.name]), + ), + showlegend=BasePlot._fig.showlegend(_cm[state.name], legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout({f"xaxis{yaxis[1:]}_type": "date", "barmode": "group"}) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Datetime", + ylabel="Trial", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_timeline", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_trials( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: INT | str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper left", + figsize: tuple[INT, INT] = (900, 800), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the hyperparameter tuning trials. + + Creates a figure with two plots: the first plot shows the score + of every trial and the second shows the distance between the + last consecutive steps. The best trial is indicated with a star. + This is the same plot as produced by `ht_params={"plot": True}`. + This plot is only available for models that ran + [hyperparameter tuning][]. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models that used hyperparameter + tuning are selected. 
+ + metric: int, str, sequence or None, default=None + Metric to plot (only for multi-metric runs). Add `+` between + options to select more than one. If None, all metrics are + selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 800) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_evals + atom.plots:HyperparameterTuningPlot.plot_hyperparameters + atom.plots:PredictionPlot.plot_results + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["ET", "RF"], n_trials=15) + atom.plot_trials() + ``` + + """ + models = check_hyperparams(models, "plot_trials") + metric = self._get_metric(metric, max_one=False) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0)) + xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29)) + for m in models: + for met in metric: + y = m.trials["score"].apply(lambda value: lst(value)[met]) + + # Create star symbol at best trial + symbols = ["circle"] * len(y) + symbols[m.best_trial.number] = "star" + sizes = [self.marker_size] * len(y) + sizes[m.best_trial.number] = self.marker_size * 1.5 + + fig.add_trace( + self._draw_line( + x=list(range(len(y))), + y=y, + mode="lines+markers", + marker_symbol=symbols, + marker_size=sizes, + hovertemplate=None, + parent=m.name, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis2, + yaxis=yaxis, + ) + ) + + fig.add_trace( + self._draw_line( + x=list(range(1, len(y))), + y=np.abs(np.diff(y)), + mode="lines+markers", + marker_symbol="circle", + parent=m.name, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis2, + yaxis=yaxis2, + ) + ) + + fig.update_layout( + { + f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}", + f"xaxis{xaxis[1:]}_showticklabels": False, + "hovermode": "x unified", + }, + ) + + self._plot( + ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), + xlabel="Trial", + ylabel="d", + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + ylabel="Score", + title=title, + legend=legend, + 
figsize=figsize, + plotname="plot_trials", + filename=filename, + display=display, + ) diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py new file mode 100644 index 000000000..22ef8a691 --- /dev/null +++ b/atom/plots/predictionplot.py @@ -0,0 +1,3546 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing the PredictionPlot class. + +""" + +from __future__ import annotations + +from collections import defaultdict +from functools import reduce +from itertools import chain + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from joblib import Parallel, delayed +from plotly.colors import unconvert_from_RGB_255, unlabel_rgb +from scipy import stats +from scipy.stats.mstats import mquantiles +from sklearn.calibration import calibration_curve +from sklearn.inspection import partial_dependence, permutation_importance +from sklearn.metrics import ( + confusion_matrix, det_curve, precision_recall_curve, roc_curve, +) +from sklearn.utils import _safe_indexing +from sklearn.utils.metaestimators import available_if +from sktime.forecasting.base import ForecastingHorizon +from typeguard import typechecked + +from atom.plots.base import BasePlot +from atom.utils.constants import PALETTE +from atom.utils.types import ( + FEATURES, FLOAT, INT, LEGEND, METRIC_SELECTOR, MODEL, SCALAR, SEQUENCE, + SLICE, +) +from atom.utils.utils import ( + bk, check_canvas, check_dependency, check_predict_proba, composed, crash, + divide, get_best_score, get_custom_scorer, has_task, is_binary, + is_multioutput, lst, plot_from_model, rnd, +) + + +@typechecked +class PredictionPlot(BasePlot): + """Prediction plots. + + Plots that use the model's predictions. These plots are accessible + from the runners or from the models. If called from a runner, the + `models` parameter has to be specified (if None, uses all models). 
+ If called from a model, that model is used and the `models` parameter + becomes unavailable. + + """ + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_calibration( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + n_bins: INT = 10, + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = "upper left", + figsize: tuple[INT, INT] = (900, 900), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the calibration curve for a binary classifier. + + Well calibrated classifiers are probabilistic classifiers for + which the output of the `predict_proba` method can be directly + interpreted as a confidence level. For instance a well + calibrated (binary) classifier should classify the samples such + that among the samples to which it gave a `predict_proba` value + close to 0.8, approx. 80% actually belong to the positive class. + Read more in sklearn's [documentation][calibration]. + + This figure shows two plots: the calibration curve, where the + x-axis represents the average predicted probability in each bin + and the y-axis is the fraction of positives, i.e. the proportion + of samples whose class is the positive class (in each bin); and + a distribution of all predicted probabilities of the classifier. + This plot is available only for models with a `predict_proba` + method in a binary or [multilabel][] classification task. + + !!! tip + Use the [calibrate][adaboost-calibrate] method to calibrate + the winning model. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the metric. Use a sequence + or add `+` between options to select more than one. Choose + from: "train", "test" or "holdout". 
+ + target: int or str, default=0 + Target column to look at. Only for [multilabel][] tasks. + + n_bins: int, default=10 + Number of bins used for calibration. Minimum of 5 required. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 900) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_lift + atom.plots:PredictionPlot.plot_prc + atom.plots:PredictionPlot.plot_roc + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["RF", "LGB"]) + atom.plot_calibration() + ``` + + """ + check_predict_proba(models, "plot_calibration") + dataset = self._get_set(dataset, max_one=False) + target = self.branch._get_target(target, only_columns=True) + + if n_bins < 5: + raise ValueError( + "Invalid value for the n_bins parameter." + f"Value should be >=5, got {n_bins}." 
+ ) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes(y=(0.31, 1.0)) + xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.0, 0.29)) + for m in models: + for ds in dataset: + y_true, y_pred = m._get_pred(ds, target, attr="predict_proba") + + # Get calibration (frac of positives and predicted values) + frac_pos, pred = calibration_curve(y_true, y_pred, n_bins=n_bins) + + fig.add_trace( + self._draw_line( + x=pred, + y=frac_pos, + parent=m.name, + child=ds, + mode="lines+markers", + marker_symbol="circle", + legend=legend, + xaxis=xaxis2, + yaxis=yaxis, + ) + ) + + fig.add_trace( + go.Histogram( + x=y_pred, + xbins=dict(start=0, end=1, size=1. / n_bins), + marker=dict( + color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + ), + name=m.name, + legendgroup=m.name, + showlegend=False, + xaxis=xaxis2, + yaxis=yaxis2, + ) + ) + + self._draw_straight_line(y="diagonal", xaxis=xaxis2, yaxis=yaxis) + + fig.update_layout( + { + f"yaxis{yaxis[1:]}_anchor": f"x{xaxis2[1:]}", + f"xaxis{xaxis2[1:]}_showgrid": True, + "barmode": "overlay", + } + ) + + self._plot( + ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), + xlabel="Predicted value", + ylabel="Count", + xlim=(0, 1), + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + ylabel="Fraction of positives", + ylim=(-0.05, 1.05), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_calibration", + filename=filename, + display=display, + ) + + @available_if(has_task("class")) + @composed(crash, plot_from_model) + def plot_confusion_matrix( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str = "test", + target: INT | str = 0, + threshold: FLOAT = 0.5, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + 
display: bool | None = True, + ) -> go.Figure | None: + """Plot a model's confusion matrix. + + For one model, the plot shows a heatmap. For multiple models, + it compares TP, FP, FN and TN in a barplot (not implemented + for multiclass classification tasks). This plot is available + only for classification tasks. + + !!! tip + Fill the `threshold` parameter with the result from the + model's `get_best_threshold` method to optimize the results. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str, default="test" + Data set on which to calculate the confusion matrix. Choose + from:` "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multioutput tasks][]. + + threshold: float, default=0.5 + Threshold between 0 and 1 to convert predicted probabilities + to class labels. Only for binary classification tasks. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the plot's type. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. 
+ + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_calibration + atom.plots:PredictionPlot.plot_threshold + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, test_size=0.4) + atom.run(["LR", "RF"]) + atom.lr.plot_confusion_matrix() # For one model + atom.plot_confusion_matrix() # For multiple models + ``` + + """ + ds = self._get_set(dataset, max_one=True) + target = self.branch._get_target(target, only_columns=True) + + if self.task.startswith("multiclass") and len(models) > 1: + raise NotImplementedError( + "The plot_confusion_matrix method does not support " + "the comparison of multiple models for multiclass " + "or multiclass-multioutput classification tasks." + ) + + labels = np.array( + (("True negatives", "False positives"), ("False negatives", "True positives")) + ) + + fig = self._get_figure() + if len(models) == 1: + xaxis, yaxis = BasePlot._fig.get_axes( + x=(0, 0.87), + coloraxis=dict( + colorscale="Blues", + cmin=0, + cmax=100, + title="Percentage of samples", + font_size=self.label_fontsize, + ), + ) + else: + xaxis, yaxis = BasePlot._fig.get_axes() + + for m in models: + y_true, y_pred = m._get_pred(ds, target, attr="predict") + if threshold != 0.5: + y_pred = (y_pred > threshold).astype("int") + + cm = confusion_matrix(y_true, y_pred) + if len(models) == 1: # Create matrix heatmap + ticks = m.mapping.get(target, np.unique(m.dataset[target]).astype(str)) + xaxis, yaxis = BasePlot._fig.get_axes( + x=(0, 0.87), + coloraxis=dict( + colorscale="Blues", + cmin=0, + cmax=100, + title="Percentage of samples", + font_size=self.label_fontsize, + ), + ) + + fig.add_trace( + go.Heatmap( + x=ticks, + y=ticks, + z=100. 
* cm / cm.sum(axis=1)[:, np.newaxis], + coloraxis=f"coloraxis{xaxis[1:]}", + text=cm, + customdata=labels, + texttemplate="%{text}
(%{z:.2f}%)", + textfont=dict(size=self.label_fontsize), + hovertemplate=( + "%{customdata}
" if is_binary(self.task) else "" + "x:%{x}
y:%{y}
z:%{z}" + ), + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + "template": "plotly_white", + f"yaxis{yaxis[1:]}_autorange": "reversed", + f"xaxis{xaxis[1:]}_showgrid": False, + f"yaxis{yaxis[1:]}_showgrid": False, + } + ) + + else: + color = BasePlot._fig.get_elem(m.name) + fig.add_trace( + go.Bar( + x=cm.ravel(), + y=labels.ravel(), + orientation="h", + marker=dict( + color=f"rgba({color[4:-1]}, 0.2)", + line=dict(width=2, color=color), + ), + hovertemplate="%{x}", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout(bargroupgap=0.05) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Predicted label" if len(models) == 1 else "Count", + ylabel="True label" if len(models) == 1 else None, + title=title, + legend=legend, + figsize=figsize or ((800, 800) if len(models) == 1 else (900, 600)), + plotname="plot_confusion_matrix", + filename=filename, + display=display, + ) + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_det( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ): + """Plot the Detection Error Tradeoff curve. + + Read more about [DET][] in sklearn's documentation. Only + available for binary classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the metric. Use a sequence + or add `+` between options to select more than one. 
Choose + from: "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multilabel][] tasks. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_gains + atom.plots:PredictionPlot.plot_roc + atom.plots:PredictionPlot.plot_prc + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_det() + ``` + + """ + dataset = self._get_set(dataset, max_one=False) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + for ds in dataset: + # Get fpr-fnr pairs for different thresholds + fpr, fnr, _ = det_curve(*m._get_pred(ds, target, attr="thresh")) + + fig.add_trace( + self._draw_line( + x=fpr, + y=fnr, + mode="lines", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="FPR", + ylabel="FNR", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_det", + filename=filename, + display=display, + ) + + @available_if(has_task("reg")) + @composed(crash, plot_from_model) + def plot_errors( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a model's prediction errors. + + Plot the actual targets from a set against the predicted values + generated by the regressor. A linear fit is made on the data. + The gray, intersected line shows the identity line. This plot + can be useful to detect noise or heteroscedasticity along a + range of the target domain. This plot is available only for + regression tasks. 
+ + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str, default="test" + Data set on which to calculate the metric. Choose from: + "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multioutput tasks][]. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_residuals + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import load_diabetes + + X, y = load_diabetes(return_X_y=True, as_frame=True) + + atom = ATOMRegressor(X, y) + atom.run(["OLS", "LGB"]) + atom.plot_errors() + ``` + + """ + ds = self._get_set(dataset, max_one=True) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + y_true, y_pred = m._get_pred(ds, target) + + fig.add_trace( + go.Scatter( + x=y_true, + y=y_pred, + mode="markers", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + # Fit the points using linear regression + from atom.models import OrdinaryLeastSquares + model = OrdinaryLeastSquares(goal=self.goal, branch=m.branch)._get_est() + model.fit(y_true.values.reshape(-1, 1), y_pred) + + fig.add_trace( + go.Scatter( + x=(x := np.linspace(y_true.min(), y_true.max(), 100)), + y=model.predict(x[:, np.newaxis]), + mode="lines", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + hovertemplate="(%{x}, %{y})", + legendgroup=m.name, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + xlabel="True value", + title=title, + legend=legend, + ylabel="Predicted value", + figsize=figsize, + plotname="plot_errors", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(ensembles=False)) + def plot_evals( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + *, + title: str | dict | None = None, + legend: str | dict | None = 
"lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot evaluation curves. + + The evaluation curves are the main metric scores achieved by the + models at every iteration of the training process. This plot is + available only for models that allow [in-training validation][]. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the evaluation curves. Use a + sequence or add `+` between options to select more than one. + Choose from: "train" or "test". + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:HyperparameterTuningPlot.plot_trials + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["XGB", "LGB"]) + atom.plot_evals() + ``` + + """ + dataset = self._get_set(dataset, max_one=False, allow_holdout=False) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + if not m.evals: + raise ValueError( + "Invalid value for the models parameter. Model " + f"{m.name} has no in-training validation." + ) + + for ds in dataset: + fig.add_trace( + self._draw_line( + x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))), + y=m.evals[f"{self._metric[0].name}_{ds}"], + marker_symbol="circle", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Iterations", + ylabel=self._metric[0].name, + title=title, + legend=legend, + figsize=figsize, + plotname="plot_evals", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_feature_importance( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + show: INT | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a model's feature importance. + + The sum of importances for all features (per model) is 1. + This plot is available only for models whose estimator has + a `scores_`, `feature_importances_` or `coef` attribute. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. 
+ + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_parshap + atom.plots:PredictionPlot.plot_partial_dependence + atom.plots:PredictionPlot.plot_permutation_importance + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_feature_importance(show=10) + ``` + + """ + show = self._get_show(show, models) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + if (fi := m.feature_importance) is None: + raise ValueError( + "Invalid value for the models parameter. 
The estimator " + f"{m.estimator.__class__.__name__} has no feature_importances_ " + "nor coef_ attribute." + ) + + fig.add_trace( + go.Bar( + x=fi, + y=fi.index, + orientation="h", + marker=dict( + color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + ), + hovertemplate="%{x}", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), + "bargroupgap": 0.05, + } + ) + + # Unique number of features over all branches + n_fxs = len(set([fx for m in models for fx in m.features])) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Normalized feature importance", + ylim=(n_fxs - show - 0.5, n_fxs - 0.5), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_feature_importance", + filename=filename, + display=display, + ) + + @available_if(has_task("forecast")) + @composed(crash, plot_from_model(check_fitted=False)) + def plot_forecast( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + fh: int | str | range | SEQUENCE | ForecastingHorizon = "test", + X: FEATURES | None = None, + target: INT | str = 0, + plot_interval: bool = True, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper left", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a time series with model forecasts. + + This plot is only available for forecasting tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. If no + models are selected, only the target column is plotted. 
+ + fh: int, str, range, sequence or [ForecastingHorizon][], default="test" + Forecast horizon for which to plot the predictions. If + string, choose from: "train", "test" or "holdout". Use a + sequence or add `+` between options to select more than one. + + X: dataframe-like or None, default=None + Exogenous time series corresponding to fh. This parameter + is ignored if fh is a data set. + + target: int or str, default=0 + Target column to look at. Only for [multivariate][] tasks. + + plot_interval: bool, default=True + Whether to plot prediction intervals instead of the exact + prediction values. If True, the plotted estimators should + have a `predict_interval` method. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_lift + atom.plots:PredictionPlot.plot_prc + atom.plots:PredictionPlot.plot_roc + + Examples + -------- + ```pycon + from atom import ATOMForecaster + from sktime.datasets import load_airline + + y = load_airline() + + atom = ATOMForecaster(y, random_state=1) + atom.plot_forecast() + atom.run( + models="arima", + est_params={"order": (1, 1, 0), "seasonal_order": (0, 1, 0, 12)}, + ) + atom.plot_forecast() + atom.plot_forecast(fh="train+test", plot_interval=False) + + # Forecast the next 4 years starting from the test set + atom.plot_forecast(fh=range(1, 48)) + ``` + + """ + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + # Draw original time series + for ds in ("train", "test"): + fig.add_trace( + go.Scatter( + x=self._get_plot_index(getattr(self, ds)), + y=getattr(self, ds)[target], + mode="lines+markers", + line=dict( + width=2, + color="black", + dash=BasePlot._fig.get_elem(ds, "dash"), + ), + opacity=0.6, + name=ds, + showlegend=False if models else BasePlot._fig.showlegend(ds, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + # Draw predictions + for m in models: + if isinstance(fh, str): + # Get fh and corresponding X from data set + datasets = self._get_set(fh, max_one=False) + fh = bk.concat([getattr(m, ds) for ds in datasets]).index + X = m.X.loc[fh] + + y_pred = m.predict(fh, X) + if is_multioutput(self.task): + y_pred = y_pred[target] + + fig.add_trace( + self._draw_line( + x=self._get_plot_index(y_pred), + y=y_pred, + mode="lines+markers", + parent=m.name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + if plot_interval: + try: + y_pred = m.predict_interval(fh, X) + except NotImplementedError: + continue # Fails for some models like ES + + if is_multioutput(self.task): + # Select interval of target column for multivariate + y = y_pred.iloc[:, y_pred.columns.get_loc(target)] + else: + y = y_pred # 
Univariate + + fig.add_traces( + [ + go.Scatter( + x=self._get_plot_index(y_pred), + y=y.iloc[:, 1], + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem(m.name)), + hovertemplate=f"%{{y}}{m.name} - upper bound", + legendgroup=m.name, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + go.Scatter( + x=self._get_plot_index(y_pred), + y=y.iloc[:, 0], + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem(m.name)), + fill="tonexty", + fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)", + hovertemplate=f"%{{y}}{m.name} - lower bound", + legendgroup=m.name, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ) + ] + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup" if plot_interval else "toggleitem", + xlabel=self.y.index.name, + ylabel=target, + title=title, + legend=legend, + figsize=figsize, + plotname="plot_forecast", + filename=filename, + display=display, + ) + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_gains( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the cumulative gains curve. + + This plot is available only for binary and [multilabel][] + classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the metric. Use a sequence + or add `+` between options to select more than one. Choose + from: "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. 
Only for [multilabel][] tasks. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_det + atom.plots:PredictionPlot.plot_lift + atom.plots:PredictionPlot.plot_roc + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_gains() + ``` + + """ + dataset = self._get_set(dataset, max_one=False) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + for ds in dataset: + y_true, y_pred = m._get_pred(ds, target, attr="thresh") + + fig.add_trace( + self._draw_line( + x=np.arange(start=1, stop=len(y_true) + 1) / len(y_true), + y=np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum(), + mode="lines", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Fraction of sample", + ylabel="Gain", + xlim=(0, 1), + ylim=(0, 1.02), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_gains", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(ensembles=False)) + def plot_learning_curve( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: INT | str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the learning curve: score vs number of training samples. + + This plot is available only for models fitted using + [train sizing][]. [Ensembles][] are ignored. 
+ + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + metric: int, str, sequence or None, default=None + Metric to plot (only for multi-metric runs). Use a sequence + or add `+` between options to select more than one. If None, + the metric used to run the pipeline is selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_results + atom.plots:PredictionPlot.plot_successive_halving + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.train_sizing(["LR", "RF"], n_bootstrap=5) + atom.plot_learning_curve() + ``` + + """ + metric = self._get_metric(metric, max_one=False) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + for met in metric: + x, y, std = defaultdict(list), defaultdict(list), defaultdict(list) + for m in models: + x[m._group].append(m._train_idx) + y[m._group].append(get_best_score(m, met)) + if m.bootstrap is not None: + std[m._group].append(m.bootstrap.iloc[:, met].std()) + + for group in x: + fig.add_trace( + self._draw_line( + x=x[group], + y=y[group], + mode="lines+markers", + marker_symbol="circle", + error_y=dict(type="data", array=std[group], visible=True), + parent=group, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + # Add error bands + if m.bootstrap is not None: + fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)" + fig.add_traces( + [ + go.Scatter( + x=x[group], + y=np.add(y[group], std[group]), + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem(group)), + hovertemplate="%{y}upper bound", + legendgroup=group, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + go.Scatter( + x=x[group], + y=np.subtract(y[group], std[group]), + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem(group)), + fill="tonexty", + fillcolor=fillcolor, + hovertemplate="%{y}lower bound", + legendgroup=group, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + ] + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + title=title, + legend=legend, 
+ xlabel="Number of training samples", + ylabel="Score", + figsize=figsize, + plotname="plot_learning_curve", + filename=filename, + display=display, + ) + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_lift( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the lift curve. + + Only available for binary classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the metric. Use a sequence + or add `+` between options to select more than one. Choose + from: "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multilabel][] tasks. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. 
+ + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_det + atom.plots:PredictionPlot.plot_gains + atom.plots:PredictionPlot.plot_prc + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_lift() + ``` + + """ + dataset = self._get_set(dataset, max_one=False) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + for ds in dataset: + y_true, y_pred = m._get_pred(ds, target, attr="thresh") + + gains = np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() + fig.add_trace( + self._draw_line( + x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)), + y=gains / x, + mode="lines", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y=1, xaxis=xaxis, yaxis=yaxis) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Fraction of sample", + ylabel="Lift", + xlim=(0, 1), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_lift", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_parshap( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + columns: SLICE | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper left", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the partial 
correlation of shap values. + + Plots the train and test correlation between the shap value of + every feature with its target value, after removing the effect + of all other features (partial correlation). This plot is + useful to identify the features that are contributing most to + overfitting. Features that lie below the bisector (diagonal + line) performed worse on the test set than on the training set. + If the estimator has a `scores_`, `feature_importances_` or + `coef_` attribute, its normalized values are shown in a color + map. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + columns: int, str, slice, sequence or None, default=None + Features to plot. If None, it plots all features. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. 
+ + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_feature_importance + atom.plots:PredictionPlot.plot_partial_dependence + atom.plots:PredictionPlot.plot_permutation_importance + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["GNB", "RF"]) + atom.rf.plot_parshap(legend=None) + atom.plot_parshap(columns=slice(5, 10)) + ``` + + """ + target = self.branch._get_target(target) + + fig = self._get_figure() + + # Colorbar is only needed when a model has feature_importance + if all(m.feature_importance is None for m in models): + xaxis, yaxis = BasePlot._fig.get_axes() + else: + xaxis, yaxis = BasePlot._fig.get_axes( + x=(0, 0.87), + coloraxis=dict( + colorscale="Reds", + title="Normalized feature importance", + font_size=self.label_fontsize, + ) + ) + + for m in models: + parshap = {} + fxs = m.branch._get_columns(columns, include_target=False) + + for ds in ("train", "test"): + # Calculating shap values is computationally expensive, + # therefore select a random subsample for large data sets + if len(data := getattr(m, ds)) > 500: + data = data.sample(500, random_state=self.random_state) + + # Replace data with the calculated shap values + explanation = m._shap.get_explanation(data[m.features], target) + data[m.features] = explanation.values + + parshap[ds] = pd.Series(index=fxs, dtype=float) + for fx in fxs: + # All other features are covariates + covariates = [f for f in data.columns[:-1] if f != fx] + cols = [fx, data.columns[-1], *covariates] + + # Compute covariance + V = data[cols].cov() + + # Inverse covariance matrix + Vi = np.linalg.pinv(V, hermitian=True) + diag = Vi.diagonal() + + D = np.diag(np.sqrt(1 / diag)) + + # Partial correlation matrix + partial_corr = -1 * (D @ Vi 
@ D) # @ is matrix multiplication + + # Semi-partial correlation matrix + with np.errstate(divide="ignore"): + V_sqrt = np.sqrt(np.diag(V))[..., None] + Vi_sqrt = np.sqrt(np.abs(diag - Vi ** 2 / diag[..., None])).T + semi_partial_correlation = partial_corr / V_sqrt / Vi_sqrt + + # X covariates are removed + parshap[ds][fx] = semi_partial_correlation[1, 0] + + # Get the feature importance or coefficients + if m.feature_importance is not None: + color = m.feature_importance.loc[fxs] + else: + color = BasePlot._fig.get_elem("parshap") + + fig.add_trace( + go.Scatter( + x=parshap["train"], + y=parshap["test"], + mode="markers+text", + marker=dict( + color=color, + size=self.marker_size, + coloraxis=f"coloraxis{xaxis[1:]}", + line=dict(width=1, color="rgba(255, 255, 255, 0.9)"), + ), + text=m.features, + textposition="top center", + customdata=(data := None if isinstance(color, str) else list(color)), + hovertemplate=( + f"%{{text}}
(%{{x}}, %{{y}})" + f"{'
Feature importance: %{customdata:.4f}' if data else ''}" + f"{m.name}" + ), + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Training set", + ylabel="Test set", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_parshap", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_partial_dependence( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + columns: SLICE | None = None, + kind: str | SEQUENCE = "average", + pair: int | str | None = None, + target: INT | str = 1, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the partial dependence of features. + + The partial dependence of a feature (or a set of features) + corresponds to the response of the model for each possible + value of the feature. The plot can take two forms: + + - If `pair` is None: Single feature partial dependence lines. + The deciles of the feature values are shown with tick marks + on the bottom. + - If `pair` is defined: Two-way partial dependence plots are + plotted as contour plots (only allowed for a single model). + + Read more about partial dependence on sklearn's + [documentation][partial_dependence]. This plot is not available + for multilabel nor multiclass-multioutput classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + columns: int, str, slice, sequence or None, default=None + Features to get the partial dependence from. 
If None, it + uses the first 3 features in the dataset. + + kind: str or sequence, default="average" + Kind of dependence to plot. Use a sequence or add `+` between + options to select more than one. Choose from: + + - "average": Partial dependence averaged across all samples + in the dataset. + - "individual": Partial dependence for up to 50 random + samples (Individual Conditional Expectation). + + This parameter is ignored when plotting feature pairs. + + pair: int, str or None, default=None + Feature with which to pair the features selected by + `columns`. If specified, the resulting figure displays + contour plots. Only allowed when plotting a single model. + If None, the plots show the partial dependence of single + features. + + target: int or str, default=1 + Class in the target column to look at (only for multiclass + classification tasks). + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_feature_importance + atom.plots:PredictionPlot.plot_parshap + atom.plots:PredictionPlot.plot_permutation_importance + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_partial_dependence(kind="average+individual", legend="upper left") + atom.rf.plot_partial_dependence(columns=(3, 4), pair=2) + ``` + + """ + if any(self.task.startswith(t) for t in ("multilabel", "multiclass-multioutput")): + raise PermissionError( + "The plot_partial_dependence method is not available for multilabel " + f"nor multiclass-multioutput classification tasks, got {self.task}." + ) + elif self.task.startswith("multiclass"): + _, target = self.branch._get_target(target) + else: + target = 0 + + kind = "+".join(lst(kind)).lower() + if any(k not in ("average", "individual") for k in kind.split("+")): + raise ValueError( + f"Invalid value for the kind parameter, got {kind}. " + "Choose from: average, individual." + ) + + axes, names = [], [] + fig = self._get_figure() + for m in models: + color = BasePlot._fig.get_elem(m.name) + + # Since every model can have different fxs, select them + # every time and make sure the models use the same fxs + cols = m.branch._get_columns( + columns=(0, 1, 2) if columns is None else columns, + include_target=False, + ) + + if not names: + names = cols + elif names != cols: + raise ValueError( + "Invalid value for the columns parameter. Not all " + f"models use the same features, got {names} and {cols}." + ) + + if pair is not None: + if len(models) > 1: + raise ValueError( + f"Invalid value for the pair parameter, got {pair}. 
" + "The value must be None when plotting multiple models" + ) + else: + pair = m.branch._get_columns(pair, include_target=False) + cols = [(c, pair[0]) for c in cols] + else: + cols = [(c,) for c in cols] + + # Create new axes + if not axes: + for i, col in enumerate(cols): + # Calculate the distance between subplots + offset = divide(0.025, len(cols) - 1) + + # Calculate the size of the subplot + size = (1 - ((offset * 2) * (len(cols) - 1))) / len(cols) + + # Determine the position for the axes + x_pos = i % len(cols) * (size + 2 * offset) + + xaxis, yaxis = BasePlot._fig.get_axes(x=(x_pos, rnd(x_pos + size))) + axes.append((xaxis, yaxis)) + + # Compute averaged predictions + predictions = Parallel(n_jobs=self.n_jobs, backend=self.backend)( + delayed(partial_dependence)( + estimator=m.estimator, + X=m.X_test, + features=col, + kind="both" if "individual" in kind else "average", + ) for col in cols + ) + + # Compute deciles for ticks (only if line plots) + if len(cols[0]) == 1: + deciles = {} + for fx in chain.from_iterable(cols): + if fx not in deciles: # Skip if the feature is repeated + X_col = _safe_indexing(m.X_test, fx, axis=1) + deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1)) + + for i, (ax, fx, pred) in enumerate(zip(axes, cols, predictions)): + # Draw line or contour plot + if len(pred["values"]) == 1: + # For both average and individual: draw ticks on the horizontal axis + for line in deciles[fx[0]]: + fig.add_shape( + type="line", + x0=line, + x1=line, + xref=ax[0], + y0=0, + y1=0.05, + yref=f"{axes[0][1]} domain", + line=dict(width=1, color=BasePlot._fig.get_elem(m.name)), + opacity=0.6, + layer="below", + ) + + # Draw the mean of the individual lines + if "average" in kind: + fig.add_trace( + go.Scatter( + x=pred["values"][0], + y=pred["average"][target].ravel(), + mode="lines", + line=dict(width=2, color=color), + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=ax[0], + 
yaxis=axes[0][1], + ) + ) + + # Draw all individual (per sample) lines (ICE) + if "individual" in kind: + # Select up to 50 random samples to plot + idx = np.random.choice( + list(range(len(pred["individual"][target]))), + size=min(len(pred["individual"][target]), 50), + replace=False, + ) + for sample in pred["individual"][target, idx, :]: + fig.add_trace( + go.Scatter( + x=pred["values"][0], + y=sample, + mode="lines", + line=dict(width=0.5, color=color), + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=ax[0], + yaxis=axes[0][1], + ) + ) + + else: + colorscale = PALETTE.get(BasePlot._fig.get_elem(m.name), "Teal") + fig.add_trace( + go.Contour( + x=pred["values"][0], + y=pred["values"][1], + z=pred["average"][target], + contours=dict( + showlabels=True, + labelfont=dict(size=self.tick_fontsize, color="white") + ), + hovertemplate="x:%{x}
y:%{y}
z:%{z}", + hoverongaps=False, + colorscale=colorscale, + showscale=False, + showlegend=False, + xaxis=ax[0], + yaxis=axes[0][1], + ) + ) + + self._plot( + ax=(f"xaxis{ax[0][1:]}", f"yaxis{ax[1][1:]}"), + xlabel=fx[0], + ylabel=(fx[1] if len(fx) > 1 else "Score") if i == 0 else None, + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + groupclick="togglegroup", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_partial_dependence", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_permutation_importance( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + show: INT | None = None, + n_repeats: INT = 10, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the feature permutation importance of models. + + !!! warning + This method can be slow. Results are cached to fasten + repeated calls. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + n_repeats: int, default=10 + Number of times to permute each feature. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). 
If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_feature_importance + atom.plots:PredictionPlot.plot_partial_dependence + atom.plots:PredictionPlot.plot_parshap + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_permutation_importance(show=10, n_repeats=7) + ``` + + """ + show = self._get_show(show, models) + + if n_repeats <= 0: + raise ValueError( + "Invalid value for the n_repeats parameter." + f"Value should be >0, got {n_repeats}." 
+ ) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + for m in models: + # Permutation importances returns Bunch object + permutations = self._memory.cache(permutation_importance)( + estimator=m.estimator, + X=m.X_test, + y=m.y_test, + scoring=self._metric[0], + n_repeats=n_repeats, + n_jobs=self.n_jobs, + random_state=self.random_state, + ) + + fig.add_trace( + go.Box( + x=permutations["importances"].ravel(), + y=list(np.array([[fx] * n_repeats for fx in m.features]).ravel()), + marker_color=BasePlot._fig.get_elem(m.name), + boxpoints="outliers", + orientation="h", + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), + "boxmode": "group", + } + ) + + # Unique number of features over all branches + n_fxs = len(set([fx for m in models for fx in m.features])) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Score", + ylim=(n_fxs - show - 0.5, n_fxs - 0.5), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_permutation_importance", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(check_fitted=False)) + def plot_pipeline( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + draw_hyperparameter_tuning: bool = True, + color_branches: bool | None = None, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot a diagram of the pipeline. + + !!! warning + This plot uses the [schemdraw][] package, which is + incompatible with [plotly][]. The returned plot is + therefore a [matplotlib figure][pltfigure]. 
+ + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models for which to draw the pipeline. If None, all + pipelines are plotted. + + draw_hyperparameter_tuning: bool, default=True + Whether to draw if the models used Hyperparameter Tuning. + + color_branches: bool or None, default=None + Whether to draw every branch in a different color. If None, + branches are colored when there is more than one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the pipeline drawn. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:DataPlot.plot_wordcloud + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["GNB", "RNN", "SGD", "MLP"]) + atom.voting(models=atom.winners[:2]) + atom.plot_pipeline() + + atom = ATOMClassifier(X, y, random_state=1) + atom.scale() + atom.prune() + atom.run("RF", n_trials=30) + + atom.branch = "undersample" + atom.balance("nearmiss") + atom.run("RF_undersample") + + atom.branch = "oversample_from_master" + atom.balance("smote") + atom.run("RF_oversample") + + atom.plot_pipeline() + ``` + + """ + + def get_length(pl, i): + """Get the maximum length of the name of a block.""" + if len(pl) > i: + return max(len(pl[i].__class__.__name__) * 0.5, 7) + else: + return 0 + + def check_y(xy): + """Return y unless there is something right, then jump.""" + while any(pos[0] > xy[0] and pos[1] == xy[1] for pos in positions.values()): + xy = Point((xy[0], xy[1] + height)) + + return xy[1] + + def add_wire(x, y): + """Draw a connecting wire between two estimators.""" + d.add( + Wire(shape="z", k=(x - d.here[0]) / (length + 1), arrow="->") + .to((x, y)) + .color(branch["color"]) + ) + + # Update arrowhead manually + d.elements[-1].segments[-1].arrowwidth = 0.3 + d.elements[-1].segments[-1].arrowlength = 0.5 + + check_dependency("schemdraw") + from schemdraw import Drawing + from schemdraw.flow import Data, RoundBox, Subroutine, Wire + from schemdraw.util import Point + + fig = self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_pipeline") + + # Define branches to plot (if called from model, it's only one) + branches = [] + for branch in getattr(self, "_branches", [self.branch]): + draw_models, draw_ensembles = [], [] + for m in models: + if m.branch is branch: + if m.acronym not in ("Stack", "Vote"): + draw_models.append(m) + 
else: + draw_ensembles.append(m) + + # Additionally, add all dependent models (if not already there) + draw_models.extend([i for i in m._models if i not in draw_models]) + + if not models or draw_models: + branches.append( + { + "name": branch.name, + "pipeline": list(branch.pipeline), + "models": draw_models, + "ensembles": draw_ensembles, + } + ) + + # Define colors per branch + for branch in branches: + if color_branches or (color_branches is None and len(branches) > 1): + color = next(BasePlot._fig.palette) + + # Convert back to format accepted by matplotlib + branch["color"] = unconvert_from_RGB_255(unlabel_rgb(color)) + else: + branch["color"] = "black" + + # Create schematic drawing + d = Drawing(unit=1, backend="matplotlib") + d.config(fontsize=self.tick_fontsize) + d.add(Subroutine(w=8, s=0.7).label("Raw data")) + + height = 3 # Height of every block + length = 5 # Minimum arrow length + + # Define the x-position for every block + x_pos = [d.here[0] + length] + for i in range(max(len(b["pipeline"]) for b in branches)): + len_block = reduce(max, [get_length(b["pipeline"], i) for b in branches]) + x_pos.append(x_pos[-1] + length + len_block) + + # Add positions for scaling, hyperparameter tuning and models + x_pos.extend([x_pos[-1], x_pos[-1]]) + if any(m.scaler for m in models): + x_pos[-1] = x_pos[-2] = x_pos[-3] + length + 7 + if draw_hyperparameter_tuning and any(m.trials is not None for m in models): + x_pos[-1] = x_pos[-2] + length + 11 + + positions = {0: d.here} # Contains the position of every element + for branch in branches: + d.here = positions[0] + + for i, est in enumerate(branch["pipeline"]): + # If the estimator has already been seen, don't draw + if id(est) in positions: + # Change location to estimator's end + d.here = positions[id(est)] + continue + + # Draw transformer + add_wire(x_pos[i], check_y(d.here)) + d.add( + RoundBox(w=max(len(est.__class__.__name__) * 0.5, 7)) + .label(est.__class__.__name__, color="k") + .color(branch["color"]) 
+ .anchor("W") + .drop("E") + ) + + positions[id(est)] = d.here + + for model in branch["models"]: + # Position at last transformer or at start + if branch["pipeline"]: + d.here = positions[id(est)] + else: + d.here = positions[0] + + # For a single branch, center models + if len(branches) == 1: + offset = height * (len(branch["models"]) - 1) / 2 + else: + offset = 0 + + # Draw automated feature scaling + if model.scaler: + add_wire(x_pos[-3], check_y((d.here[0], d.here[1] - offset))) + d.add( + RoundBox(w=7) + .label("Scaler", color="k") + .color(branch["color"]) + .drop("E") + ) + offset = 0 + + # Draw hyperparameter tuning + if draw_hyperparameter_tuning and model.trials is not None: + add_wire(x_pos[-2], check_y((d.here[0], d.here[1] - offset))) + d.add( + Data(w=11) + .label("Hyperparameter\nTuning", color="k") + .color(branch["color"]) + .drop("E") + ) + offset = 0 + + # Remove classifier/regressor from model's name + name = model.estimator.__class__.__name__ + if name.lower().endswith("classifier"): + name = name[:-10] + elif name.lower().endswith("regressor"): + name = name[:-9] + + # Draw model + add_wire(x_pos[-1], check_y((d.here[0], d.here[1] - offset))) + d.add( + Data(w=max(len(name) * 0.5, 7)) + .label(name, color="k") + .color(branch["color"]) + .anchor("W") + .drop("E") + ) + + positions[id(model)] = d.here + + # Draw ensembles + max_pos = max(pos[0] for pos in positions.values()) # Max length model names + for branch in branches: + for model in branch["ensembles"]: + # Determine y-position of the ensemble + y_pos = [positions[id(m)][1] for m in model._models] + offset = height / 2 * (len(branch["ensembles"]) - 1) + y = min(y_pos) + (max(y_pos) - min(y_pos)) * 0.5 - offset + y = check_y((max_pos + length, max(min(y_pos), y))) + + d.here = (max_pos + length, y) + + d.add( + Data(w=max(len(model._fullname) * 0.5, 7)) + .label(model._fullname, color="k") + .color(branch["color"]) + .anchor("W") + .drop("E") + ) + + positions[id(model)] = d.here + + # 
Draw a wire from every model to the ensemble + for m in model._models: + d.here = positions[id(m)] + add_wire(max_pos + length, y) + + if not figsize: + dpi, bbox = fig.get_dpi(), d.get_bbox() + figsize = (dpi * bbox.xmax // 4, (dpi / 2) * (bbox.ymax - bbox.ymin)) + + d.draw(canvas=plt.gca(), showframe=False, show=False) + plt.axis("off") + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=plt.gca(), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_pipeline", + filename=filename, + display=display, + ) + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_prc( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower left", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the precision-recall curve. + + Read more about [PRC][] in sklearn's documentation. Only + available for binary and multilabel classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the metric. Use a sequence + or add `+` between options to select more than one. Choose + from: "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multilabel][] tasks. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. 
+ - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_det + atom.plots:PredictionPlot.plot_lift + atom.plots:PredictionPlot.plot_roc + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_prc() + ``` + + """ + dataset = self._get_set(dataset, max_one=False) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + for ds in dataset: + y_true, y_pred = m._get_pred(ds, target, attr="thresh") + + # Get precision-recall pairs for different thresholds + prec, rec, _ = precision_recall_curve(y_true, y_pred) + + fig.add_trace( + self._draw_line( + x=rec, + y=prec, + mode="lines", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + # Straight line from the mean of `m.y_test` (presumably the baseline). + # NOTE(review): `m` is the last model left over from the loop above and + # `dataset` is not consulted here; confirm this is intended. 
+ self._draw_straight_line(sum(m.y_test) / len(m.y_test), xaxis=xaxis, yaxis=yaxis) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Recall", + ylabel="Precision", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_prc", + filename=filename, + display=display, + ) + + @available_if(has_task("class")) + 
@composed(crash, plot_from_model) + def plot_probabilities( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str = "test", + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the probability distribution of the target classes. + + This plot is available only for models with a `predict_proba` + method in classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str, default="test" + Data set on which to calculate the metric. Choose from: + "train", "test" or "holdout". + + target: int, str or tuple, default=1 + Probability of being that class in the target column. For + multioutput tasks, the value should be a tuple of the form + (column, class). + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. 
Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_confusion_matrix + atom.plots:PredictionPlot.plot_results + atom.plots:PredictionPlot.plot_threshold + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_probabilities() + ``` + + """ + check_predict_proba(models, "plot_probabilities") + ds = self._get_set(dataset, max_one=True) + col, cls = self.branch._get_target(target) + col = lst(self.target)[col] + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + y_true, y_pred = getattr(m, f"y_{ds}"), getattr(m, f"predict_proba_{ds}") + for value in np.unique(m.dataset[col]): + # Get indices per class + if is_multioutput(self.task): + if self.task.startswith("multilabel"): + hist = y_pred.loc[y_true[col] == value, col] + else: + hist = y_pred.loc[cls, col].loc[y_true[col] == value] + else: + hist = y_pred.loc[y_true == value, str(cls)] + + fig.add_trace( + go.Scatter( + x=(x := np.linspace(0, 1, 100)), + y=stats.gaussian_kde(hist)(x), + mode="lines", + line=dict( + width=2, + color=BasePlot._fig.get_elem(m.name), + dash=BasePlot._fig.get_elem(ds, "dash"), + ), + fill="tonexty", + fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)", + fillpattern=dict(shape=BasePlot._fig.get_elem(value, "shape")), + name=f"{col}={value}", + legendgroup=m.name, + legendgrouptitle=dict(text=m.name, font_size=self.label_fontsize), + showlegend=BasePlot._fig.showlegend(f"{m.name}-{value}", legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="toggleitem", + xlabel="Probability", + ylabel="Probability density", + xlim=(0, 1), + title=title, + legend=legend, + 
figsize=figsize, + plotname="plot_probabilities", + filename=filename, + display=display, + ) + + @available_if(has_task("reg")) + @composed(crash, plot_from_model) + def plot_residuals( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "upper left", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot a model's residuals. + + The plot shows the residuals (difference between the true + and the predicted value) on the vertical axis and the true + value on the horizontal axis. The gray, intersected line + marks a residual of zero. This plot can be useful to analyze the + variance of the error of the regressor. If the points are + randomly dispersed around the horizontal axis, a linear + regression model is appropriate for the data; otherwise, a + non-linear model is more appropriate. This plot is only + available for regression tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str, default="test" + Data set on which to calculate the metric. Choose from: + "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multioutput tasks][]. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="upper left" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). 
+ + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_errors + + Examples + -------- + ```pycon + from atom import ATOMRegressor + from sklearn.datasets import load_diabetes + + X, y = load_diabetes(return_X_y=True, as_frame=True) + + atom = ATOMRegressor(X, y) + atom.run(["OLS", "LGB"]) + atom.plot_residuals() + ``` + + """ + ds = self._get_set(dataset, max_one=True) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes(x=(0, 0.69)) + xaxis2, yaxis2 = BasePlot._fig.get_axes(x=(0.71, 1.0)) + for m in models: + y_true, y_pred = m._get_pred(ds, target) + + fig.add_trace( + go.Scatter( + x=y_true, + y=(res := np.subtract(y_true, y_pred)), + mode="markers", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + name=m.name, + legendgroup=m.name, + showlegend=BasePlot._fig.showlegend(m.name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.add_trace( + go.Histogram( + y=res, + bingroup="residuals", + marker=dict( + color=f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)", + line=dict(width=2, color=BasePlot._fig.get_elem(m.name)), + ), + name=m.name, + legendgroup=m.name, + showlegend=False, + xaxis=xaxis2, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y=0, xaxis=xaxis, yaxis=yaxis) + + fig.update_layout({f"yaxis{xaxis[1:]}_showgrid": True, "barmode": "overlay"}) + + self._plot( + ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"), + xlabel="Distribution", + title=title, + ) + + 
BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + ylabel="Residuals", + xlabel="True value", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_residuals", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model) + def plot_results( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: INT | str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the model results. + + If all models applied bootstrap, the plot is a boxplot. If + not, the plot is a barplot. Models are ordered based on + their score from the top down. The score is either the + `score_bootstrap` or `score_test` attribute of the model, + selected in that order. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + metric: int, str, sequence or None, default=None + Metric to plot (only for multi-metric runs). Other available + options are "time_bo", "time_fit", "time_bootstrap" and + "time". If str, add `+` between options to select more than + one. If None, the metric used to run the pipeline is selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). 
If None, it + adapts the size to the number of models. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_confusion_matrix + atom.plots:PredictionPlot.plot_probabilities + atom.plots:PredictionPlot.plot_threshold + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"]) + atom.plot_results() + + atom.run(["GNB", "LR", "RF", "LGB"], metric=["f1", "recall"], n_bootstrap=5) + atom.plot_results() + atom.plot_results(metric="time_fit+time") + ``` + + """ + + def get_std(model: MODEL, metric: int) -> SCALAR: + """Get the standard deviation of the bootstrap scores. + + Parameters + ---------- + model: Model + Model to get the std from. + + metric: int + Index of the metric to get it from. + + Returns + ------- + int or float + Standard deviation score or 0 if not bootstrapped. 
+ + """ + if model.bootstrap is None: + return 0 + else: + return model.bootstrap.iloc[:, metric].std() + + metric = self._get_metric(metric, max_one=False) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + for met in metric: + if isinstance(met, str): + color = BasePlot._fig.get_elem(met) + fig.add_trace( + go.Bar( + x=[getattr(m, met) for m in models], + y=[m.name for m in models], + orientation="h", + marker=dict( + color=f"rgba({color[4:-1]}, 0.2)", + line=dict(width=2, color=color), + ), + hovertemplate=f"%{{x}}{met}", + name=met, + legendgroup=met, + showlegend=BasePlot._fig.showlegend(met, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + else: + name = self._metric[met].name + color = BasePlot._fig.get_elem() + + if all(m.score_bootstrap for m in models): + x = np.array([m.bootstrap.iloc[:, met] for m in models]).ravel() + y = np.array([[m.name] * len(m.bootstrap) for m in models]).ravel() + fig.add_trace( + go.Box( + x=x, + y=list(y), + marker_color=color, + boxpoints="outliers", + orientation="h", + name=name, + legendgroup=name, + showlegend=BasePlot._fig.showlegend(name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + else: + fig.add_trace( + go.Bar( + x=[get_best_score(m, met) for m in models], + y=[m.name for m in models], + error_x=dict( + type="data", + array=[get_std(m, met) for m in models], + ), + orientation="h", + marker=dict( + color=f"rgba({color[4:-1]}, 0.2)", + line=dict(width=2, color=color), + ), + hovertemplate="%{x}", + name=name, + legendgroup=name, + showlegend=BasePlot._fig.showlegend(name, legend), + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + fig.update_layout( + { + f"yaxis{yaxis[1:]}": dict(categoryorder="total ascending"), + "bargroupgap": 0.05, + "boxmode": "group", + } + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="time (s)" if all(isinstance(m, str) for m in metric) else "Score", + title=title, + legend=legend, + 
figsize=figsize or (900, 400 + len(models) * 50), + plotname="plot_results", + filename=filename, + display=display, + ) + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_roc( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + dataset: str | SEQUENCE = "test", + target: INT | str = 0, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot the Receiver Operating Characteristics curve. + + Read more about [ROC][] in sklearn's documentation. Only + available for binary and [multilabel][] classification tasks. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + dataset: str or sequence, default="test" + Data set on which to calculate the metric. Use a sequence + or add `+` between options to select more than one. Choose + from: "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multilabel][] tasks. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. 
+ + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_gains + atom.plots:PredictionPlot.plot_lift + atom.plots:PredictionPlot.plot_prc + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_roc() + ``` + + """ + dataset = self._get_set(dataset, max_one=False) + target = self.branch._get_target(target, only_columns=True) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + for m in models: + for ds in dataset: + # Get False (True) Positive Rate as arrays + fpr, tpr, _ = roc_curve(*m._get_pred(ds, target, attr="thresh")) + + fig.add_trace( + self._draw_line( + x=fpr, + y=tpr, + mode="lines", + parent=m.name, + child=ds, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + self._draw_straight_line(y="diagonal", xaxis=xaxis, yaxis=yaxis) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlim=(-0.03, 1.03), + ylim=(-0.03, 1.03), + xlabel="FPR", + ylabel="TPR", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_roc", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(ensembles=False)) + def plot_successive_halving( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: INT | str | SEQUENCE | None = None, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower right", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot scores per iteration of the successive halving. 
+ + Only use with models fitted using [successive halving][]. + [Ensembles][] are ignored. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + metric: int, str, sequence or None, default=None + Metric to plot (only for multi-metric runs). Use a sequence + or add `+` between options to select more than one. If None, + the metric used to run the pipeline is selected. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower right" + Legend for the plot. See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_learning_curve + atom.plots:PredictionPlot.plot_results + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.successive_halving(["Tree", "Bag", "RF", "LGB"], n_bootstrap=5) + atom.plot_successive_halving() + ``` + + """ + metric = self._get_metric(metric, max_one=False) + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + for met in metric: + x, y, std = defaultdict(list), defaultdict(list), defaultdict(list) + for m in models: + x[m._group].append(len(m.branch._idx[1]) // m._train_idx) + y[m._group].append(get_best_score(m, met)) + if m.bootstrap is not None: + std[m._group].append(m.bootstrap.iloc[:, met].std()) + + for group in x: + fig.add_trace( + self._draw_line( + x=x[group], + y=y[group], + mode="lines+markers", + marker_symbol="circle", + error_y=dict(type="data", array=std[group], visible=True), + parent=group, + child=self._metric[met].name, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + # Add error bands + if m.bootstrap is not None: + fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)" + fig.add_traces( + [ + go.Scatter( + x=x[group], + y=np.add(y[group], std[group]), + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem(group)), + hovertemplate="%{y}upper bound", + legendgroup=group, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + go.Scatter( + x=x[group], + y=np.subtract(y[group], std[group]), + mode="lines", + line=dict(width=1, color=BasePlot._fig.get_elem(group)), + fill="tonexty", + fillcolor=fillcolor, + hovertemplate="%{y}lower bound", + legendgroup=group, + showlegend=False, + xaxis=xaxis, + yaxis=yaxis, + ), + ] + ) + + fig.update_layout({f"xaxis{yaxis[1:]}": dict(dtick=1, autorange="reversed")}) + + BasePlot._fig.used_models.extend(models) + 
return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + groupclick="togglegroup", + title=title, + legend=legend, + xlabel="n_models", + ylabel="Score", + figsize=figsize, + plotname="plot_successive_halving", + filename=filename, + display=display, + ) + + @available_if(has_task(["binary", "multilabel"])) + @composed(crash, plot_from_model) + def plot_threshold( + self, + models: INT | str | MODEL | slice | SEQUENCE | None = None, + metric: METRIC_SELECTOR = None, + dataset: str = "test", + target: INT | str = 0, + steps: INT = 100, + *, + title: str | dict | None = None, + legend: str | dict | None = "lower left", + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> go.Figure | None: + """Plot metric performances against threshold values. + + This plot is available only for models with a `predict_proba` + method in a binary or [multilabel][] classification task. + + Parameters + ---------- + models: int, str, Model, slice, sequence or None, default=None + Models to plot. If None, all models are selected. + + metric: str, func, scorer, sequence or None, default=None + Metric to plot. Choose from any of sklearn's scorers, a + function with signature `metric(y_true, y_pred)`, a scorer + object or a sequence of these. Use a sequence or add `+` + between options to select more than one. If None, the + metric used to run the pipeline is selected. + + dataset: str, default="test" + Data set on which to calculate the metric. Choose from: + "train", "test" or "holdout". + + target: int or str, default=0 + Target column to look at. Only for [multilabel][] tasks. + + steps: int, default=100 + Number of thresholds measured. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default="lower left" + Legend for the plot. 
See the [user guide][parameters] for + an extended description of the choices. + + - If None: No legend is shown. + - If str: Location where to show the legend. + - If dict: Legend configuration. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as html. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [go.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:PredictionPlot.plot_calibration + atom.plots:PredictionPlot.plot_confusion_matrix + atom.plots:PredictionPlot.plot_probabilities + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import make_classification + + X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run(["LR", "RF"]) + atom.plot_threshold() + ``` + + """ + check_predict_proba(models, "plot_threshold") + ds = self._get_set(dataset, max_one=True) + target = self.branch._get_target(target, only_columns=True) + + # Get all metric functions from the input + if metric is None: + metrics = [m._score_func for m in self._metric] + else: + metrics = [] + for m in lst(metric): + if isinstance(m, str): + metrics.extend(m.split("+")) + else: + metrics.append(m) + metrics = [get_custom_scorer(m)._score_func for m in metrics] + + fig = self._get_figure() + xaxis, yaxis = BasePlot._fig.get_axes() + + steps = np.linspace(0, 1, steps) + for m in models: + y_true, y_pred = m._get_pred(ds, target, attr="predict_proba") + for met in metrics: + fig.add_trace( + self._draw_line( + x=steps, + y=[met(y_true, y_pred >= step) for step 
in steps], + parent=m.name, + child=met.__name__, + legend=legend, + xaxis=xaxis, + yaxis=yaxis, + ) + ) + + BasePlot._fig.used_models.extend(models) + return self._plot( + ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"), + xlabel="Threshold", + ylabel="Score", + title=title, + legend=legend, + figsize=figsize, + plotname="plot_threshold", + filename=filename, + display=display, + ) diff --git a/atom/plots/shapplot.py b/atom/plots/shapplot.py new file mode 100644 index 000000000..5c366454c --- /dev/null +++ b/atom/plots/shapplot.py @@ -0,0 +1,866 @@ +# -*- coding: utf-8 -*- + +""" +Automated Tool for Optimized Modelling (ATOM) +Author: Mavs +Description: Module containing the ShapPlot class. + +""" + +from __future__ import annotations + +from importlib.util import find_spec + +import matplotlib.pyplot as plt +import shap +from typeguard import typechecked + +from atom.plots.base import BasePlot +from atom.utils.types import INT, LEGEND, MODEL, SEQUENCE, SLICE +from atom.utils.utils import check_canvas, composed, crash, plot_from_model + + +@typechecked +class ShapPlot(BasePlot): + """Shap plots. + + ATOM wrapper for plots made by the shap package, using Shapley + values for model interpretation. These plots are accessible from + the runners or from the models. Only one model can be plotted at + the same time since the plots are not made by ATOM. + + """ + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_bar( + self, + models: INT | str | MODEL | None = None, + index: SLICE | None = None, + show: INT | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot SHAP's bar plot. + + Create a bar plot of a set of SHAP values. If a single sample + is passed, then the SHAP values are plotted. 
If many samples + are passed, then the mean absolute value for each feature + column is plotted. Read more about SHAP plots in the + [user guide][shap]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_bar()`. + + index: int, str, slice, sequence or None, default=None + Rows in the dataset to plot. If None, it selects all rows + in the test set. + + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_parshap + atom.plots:ShapPlot.plot_shap_beeswarm + atom.plots:ShapPlot.plot_shap_scatter + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_bar(show=10) + ``` + + """ + rows = models.X.loc[models.branch._get_rows(index)] + show = self._get_show(show, models) + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_bar") + + shap.plots.bar(explanation, max_display=show, show=False) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=plt.gca(), + xlabel=plt.gca().get_xlabel(), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_shap_bar", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_beeswarm( + self, + models: INT | str | MODEL | None = None, + index: slice | SEQUENCE | None = None, + show: INT | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot SHAP's beeswarm plot. + + The plot is colored by feature values. Read more about SHAP + plots in the [user guide][shap]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_beeswarm()`. 
+ index: slice, sequence or None, default=None + Rows in the dataset to plot. If None, it selects all rows + in the test set. The beeswarm plot does not support plotting + a single sample. + + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:PredictionPlot.plot_parshap + atom.plots:ShapPlot.plot_shap_bar + atom.plots:ShapPlot.plot_shap_scatter + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_beeswarm(show=10) + ``` + + """ + rows = models.X.loc[models.branch._get_rows(index)] + show = self._get_show(show, models) + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_beeswarm") + + shap.plots.beeswarm(explanation, max_display=show, show=False) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=plt.gca(), + xlabel=plt.gca().get_xlabel(), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_decision( + self, + models: INT | str | MODEL | None = None, + index: SLICE | None = None, + show: INT | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot SHAP's decision plot. + + Visualize model decisions using cumulative SHAP values. Each + plotted line explains a single model prediction. If a single + prediction is plotted, feature values are printed in the + plot (if supplied). If multiple predictions are plotted + together, feature values will not be printed. Plotting too + many predictions together will make the plot unintelligible. + Read more about SHAP plots in the [user guide][shap]. 
+ + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_decision()`. + + index: int, str, slice, sequence or None, default=None + Rows in the dataset to plot. If None, it selects all rows + in the test set. + + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:ShapPlot.plot_shap_bar + atom.plots:ShapPlot.plot_shap_beeswarm + atom.plots:ShapPlot.plot_shap_force + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_decision(show=10) + atom.plot_shap_decision(index=-1, show=10) + ``` + + """ + rows = models.X.loc[models.branch._get_rows(index)] + show = self._get_show(show, models) + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_decision") + + shap.decision_plot( + base_value=explanation.base_values, + shap_values=explanation.values, + features=rows, + feature_display_range=slice(-1, -show - 1, -1), + auto_size_plot=False, + show=False, + ) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=plt.gca(), + xlabel=plt.gca().get_xlabel(), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_shap_decision", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_force( + self, + models: INT | str | MODEL | None = None, + index: SLICE | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (900, 300), + filename: str | None = None, + display: bool | None = True, + **kwargs, + ) -> plt.Figure | None: + """Plot SHAP's force plot. + + Visualize the given SHAP values with an additive force layout. + Note that by default this plot will render using javascript. + For a regular figure use `matplotlib=True` (this option is + only available when only a single sample is plotted). Read more + about SHAP plots in the [user guide][shap]. 
+ + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_force()`. + + index: int, str, slice, sequence or None, default=None + Rows in the dataset to plot. If None, it selects all rows + in the test set. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple, default=(900, 300) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + **kwargs + Additional keyword arguments for [shap.plots.force][force]. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`.
+ + See Also + -------- + atom.plots:ShapPlot.plot_shap_beeswarm + atom.plots:ShapPlot.plot_shap_scatter + atom.plots:ShapPlot.plot_shap_decision + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_force(index=-2, matplotlib=True, figsize=(1800, 300)) + ``` + + """ + rows = models.X.loc[models.branch._get_rows(index)] + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + self._get_figure(create_figure=False, backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_force") + + plot = shap.force_plot( + base_value=explanation.base_values, + shap_values=explanation.values, + features=rows, + show=False, + **kwargs, + ) + + if kwargs.get("matplotlib"): + BasePlot._fig.used_models.append(models) + return self._plot( + fig=plt.gcf(), + ax=plt.gca(), + title=title, + legend=legend, + figsize=figsize, + plotname="plot_shap_force", + filename=filename, + display=display, + ) + else: + if filename: # Save to a html file + if not filename.endswith(".html"): + filename += ".html" + shap.save_html(filename, plot) + if display and find_spec("IPython"): + from IPython.display import display + + shap.initjs() + display(plot) + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_heatmap( + self, + models: INT | str | MODEL | None = None, + index: slice | SEQUENCE | None = None, + show: INT | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot SHAP's heatmap plot. + + This plot is designed to show the population substructure of a + dataset using supervised clustering and a heatmap. 
Supervised + clustering involves clustering data points not by their original + feature values but by their explanations. Read more about SHAP + plots in the [user guide][shap]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_heatmap()`. + + index: slice, sequence or None, default=None + Rows in the dataset to plot. If None, it selects all rows + in the test set. The plot_shap_heatmap method does not + support plotting a single sample. + + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`. 
+ + See Also + -------- + atom.plots:ShapPlot.plot_shap_decision + atom.plots:ShapPlot.plot_shap_force + atom.plots:ShapPlot.plot_shap_waterfall + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_heatmap(show=10) + ``` + + """ + rows = models.X.loc[models.branch._get_rows(index)] + show = self._get_show(show, models) + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_heatmap") + + shap.plots.heatmap(explanation, max_display=show, show=False) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=plt.gca(), + xlabel=plt.gca().get_xlabel(), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_shap_heatmap", + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_scatter( + self, + models: INT | str | MODEL | None = None, + index: slice | SEQUENCE | None = None, + columns: INT | str = 0, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] = (900, 600), + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot SHAP's scatter plot. + + Plots the value of the feature on the x-axis and the SHAP value + of the same feature on the y-axis. This shows how the model + depends on the given feature, and is like a richer extension of + the classical partial dependence plots. Vertical dispersion of + the data points represents interaction effects. Read more about + SHAP plots in the [user guide][shap]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. 
If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_scatter()`. + + index: slice, sequence or None, default=None + Rows in the dataset to plot. If None, it selects all rows + in the test set. The plot_shap_scatter method does not + support plotting a single sample. + + columns: int or str, default=0 + Column to plot. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple, default=(900, 600) + Figure's size in pixels, format as (x, y). + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming. The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`.
+ + See Also + -------- + atom.plots:ShapPlot.plot_shap_beeswarm + atom.plots:ShapPlot.plot_shap_decision + atom.plots:ShapPlot.plot_shap_force + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_scatter(columns="symmetry error") + ``` + + """ + rows = models.X.loc[models.branch._get_rows(index)] + column = models.branch._get_columns(columns, include_target=False)[0] + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + # Get explanation for a specific column + explanation = explanation[:, models.columns.get_loc(column)] + + self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_scatter") + + shap.plots.scatter(explanation, color=explanation, ax=plt.gca(), show=False) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=plt.gca(), + xlabel=plt.gca().get_xlabel(), + ylabel=plt.gca().get_ylabel(), + title=title, + legend=legend, + plotname="plot_shap_scatter", + figsize=figsize, + filename=filename, + display=display, + ) + + @composed(crash, plot_from_model(max_one=True)) + def plot_shap_waterfall( + self, + models: INT | str | MODEL | None = None, + index: INT | str | None = None, + show: INT | None = None, + target: INT | str | tuple = 1, + *, + title: str | dict | None = None, + legend: LEGEND | dict | None = None, + figsize: tuple[INT, INT] | None = None, + filename: str | None = None, + display: bool | None = True, + ) -> plt.Figure | None: + """Plot SHAP's waterfall plot. + + The SHAP value of a feature represents the impact of the + evidence provided by that feature on the model’s output. 
The + waterfall plot is designed to visually display how the SHAP + values (evidence) of each feature move the model output from + our prior expectation under the background data distribution, + to the final model prediction given the evidence of all the + features. Features are sorted by the magnitude of their SHAP + values with the smallest magnitude features grouped together + at the bottom of the plot when the number of features in the + model exceeds the `show` parameter. Read more about SHAP plots + in the [user guide][shap]. + + Parameters + ---------- + models: int, str, Model or None, default=None + Model to plot. If None, all models are selected. Note that + leaving the default option could raise an exception if there + are multiple models. To avoid this, call the plot directly + from a model, e.g. `atom.lr.plot_shap_waterfall()`. + + index: int, str or None, default=None + Row in the dataset to plot. If None, it selects the first + row in the test set. The plot_shap_waterfall method does not + support plotting multiple samples. + + show: int or None, default=None + Number of features (ordered by importance) to show. If + None, it shows all features. + + target: int, str or tuple, default=1 + Class in the target column to target. For multioutput tasks, + the value should be a tuple of the form (column, class). + Note that for binary and multilabel tasks, the selected + class is always the positive one. + + title: str, dict or None, default=None + Title for the plot. + + - If None, no title is shown. + - If str, text for the title. + - If dict, [title configuration][parameters]. + + legend: str, dict or None, default=None + Does nothing. Implemented for continuity of the API. + + figsize: tuple or None, default=None + Figure's size in pixels, format as (x, y). If None, it + adapts the size to the number of features shown. + + filename: str or None, default=None + Save the plot using this name. Use "auto" for automatic + naming.
The type of the file depends on the provided name + (.html, .png, .pdf, etc...). If `filename` has no file type, + the plot is saved as png. If None, the plot is not saved. + + display: bool or None, default=True + Whether to render the plot. If None, it returns the figure. + + Returns + ------- + [plt.Figure][] or None + Plot object. Only returned if `display=None`. + + See Also + -------- + atom.plots:ShapPlot.plot_shap_bar + atom.plots:ShapPlot.plot_shap_beeswarm + atom.plots:ShapPlot.plot_shap_heatmap + + Examples + -------- + ```pycon + from atom import ATOMClassifier + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True, as_frame=True) + + atom = ATOMClassifier(X, y, random_state=1) + atom.run("LR") + atom.plot_shap_waterfall(show=10) + ``` + + """ + rows = models.X.loc[[models.branch._get_rows(index)[0]]] + show = self._get_show(show, models) + target = self.branch._get_target(target) + explanation = models._shap.get_explanation(rows, target) + + # Waterfall accepts only one row + explanation.values = explanation.values[0] + explanation.data = explanation.data[0] + + self._get_figure(backend="matplotlib") + check_canvas(BasePlot._fig.is_canvas, "plot_shap_waterfall") + + shap.plots.waterfall(explanation, max_display=show, show=False) + + BasePlot._fig.used_models.append(models) + return self._plot( + ax=plt.gca(), + title=title, + legend=legend, + figsize=figsize or (900, 400 + show * 50), + plotname="plot_shap_waterfall", + filename=filename, + display=display, + ) diff --git a/atom/training.py b/atom/training.py index de5af1989..decf3926a 100644 --- a/atom/training.py +++ b/atom/training.py @@ -20,7 +20,8 @@ from atom.basetrainer import BaseTrainer from atom.utils.types import ( - BOOL, ENGINE, GOAL, INT, INT_TYPES, METRIC_SELECTOR, PREDICTOR, SEQUENCE, + BOOL, ENGINE, INT, INT_TYPES, METRIC_SELECTOR, PREDICTOR, SEQUENCE, + WARNINGS, ) from atom.utils.utils import ( ClassMap, composed, crash, get_best_score, 
infer_task, lst, method_to_log, @@ -341,7 +342,7 @@ class DirectClassifier(Direct): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -460,12 +461,12 @@ def __init__( engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: str = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: BOOL | str = False, + warnings: BOOL | WARNINGS = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "class" + self.goal = "class" super().__init__( models, metric, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, verbose, @@ -559,7 +560,7 @@ class DirectForecaster(Direct): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -675,12 +676,12 @@ def __init__( engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: str = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: BOOL | str = False, + warnings: BOOL | WARNINGS = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "fc" + self.goal = "fc" super().__init__( models, metric, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, verbose, warnings, @@ -774,7 +775,7 @@ class DirectRegressor(Direct): - "keep": Keep the model in its state at failure. 
Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -886,19 +887,19 @@ def __init__( n_trials: INT | dict | SEQUENCE = 0, ht_params: dict | None = None, n_bootstrap: INT | dict | SEQUENCE = 0, - parallel: bool = False, + parallel: BOOL = False, errors: Literal["raise", "skip", "keep"] = "skip", n_jobs: INT = 1, device: str = "cpu", engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: str = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: bool | str = False, + warnings: BOOL | str = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "reg" + self.goal = "reg" super().__init__( models, metric, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, verbose, warnings, @@ -999,7 +1000,7 @@ class SuccessiveHalvingClassifier(SuccessiveHalving): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. 
n_jobs: int, default=1 @@ -1112,19 +1113,19 @@ def __init__( n_trials: INT | dict | SEQUENCE = 0, ht_params: dict | None = None, n_bootstrap: INT | dict | SEQUENCE = 0, - parallel: bool = False, + parallel: BOOL = False, errors: Literal["raise", "skip", "keep"] = "skip", n_jobs: INT = 1, device: str = "cpu", engine: ENGINE = {"data": "numpy", "estimator": "sklearn"}, backend: str = "loky", verbose: Literal[0, 1, 2] = 0, - warnings: bool | str = False, + warnings: BOOL | str = False, logger: str | Logger | None = None, experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "class" + self.goal = "class" super().__init__( models, metric, skip_runs, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, @@ -1221,7 +1222,7 @@ class SuccessiveHalvingForecaster(SuccessiveHalving): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -1343,7 +1344,7 @@ def __init__( experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "fc" + self.goal = "fc" super().__init__( models, metric, skip_runs, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, @@ -1440,7 +1441,7 @@ class SuccessiveHalvingRegressor(SuccessiveHalving): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. 
n_jobs: int, default=1 @@ -1565,7 +1566,7 @@ def __init__( experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "reg" + self.goal = "reg" super().__init__( models, metric, skip_runs, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, @@ -1671,7 +1672,7 @@ class TrainSizingClassifier(TrainSizing): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -1796,7 +1797,7 @@ def __init__( experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "class" + self.goal = "class" super().__init__( models, metric, train_sizes, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, @@ -1898,7 +1899,7 @@ class TrainSizingForecaster(TrainSizing): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -2020,7 +2021,7 @@ def __init__( experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "fc" + self.goal = "fc" super().__init__( models, metric, train_sizes, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, @@ -2122,7 +2123,7 @@ class TrainSizingRegressor(TrainSizing): - "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. 
This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials. n_jobs: int, default=1 @@ -2247,7 +2248,7 @@ def __init__( experiment: str | None = None, random_state: INT | None = None, ): - self.goal: GOAL = "reg" + self.goal = "reg" super().__init__( models, metric, train_sizes, est_params, n_trials, ht_params, n_bootstrap, parallel, errors, n_jobs, device, engine, backend, diff --git a/atom/utils/types.py b/atom/utils/types.py index e124cc5e4..e362619ec 100644 --- a/atom/utils/types.py +++ b/atom/utils/types.py @@ -9,7 +9,9 @@ from __future__ import annotations -from typing import Callable, Literal, Protocol, TypedDict, Union +from typing import ( + Callable, Literal, Protocol, TypedDict, Union, runtime_checkable, +) import modin.pandas as md import numpy as np @@ -55,8 +57,6 @@ FEATURES = Union[iter, dict, list, tuple, np.ndarray, sps.spmatrix, DATAFRAME] TARGET = Union[INT, str, dict, SEQUENCE, DATAFRAME] -BACKEND = Literal["loky", "multiprocessing", "threading", "ray"] - DATASET = Literal[ "dataset", "train", @@ -73,19 +73,40 @@ ] # Selection of rows or columns by name or position -SLICE = Union[INT | str | slice | SEQUENCE] +SLICE = Union[INT, str, slice, SEQUENCE] # Assignment of index or stratify parameter -INDEX_SELECTOR = Union[bool | INT | str | SEQUENCE] +INDEX_SELECTOR = Union[bool, INT, str, SEQUENCE] -# Allowed values for the goal attribute -GOAL = Literal["class", "reg", "fc"] +# Types to initialize a metric +METRIC_SELECTOR = (str, Callable[..., SCALAR], SEQUENCE, None) -# Metric selectors -METRIC_SELECTOR = Union[str, Callable[..., SCALAR], SEQUENCE | None] +# Allowed values for BaseTransformer parameter +BACKEND = Literal["loky", "multiprocessing", "threading", "ray"] +WARNINGS = Literal["default", "error", "ignore", "always", "module", "once"] -# Pruning strategies -PRUNING = Literal["zscore", "iforest", "ee", "lof", 
"svm", "dbscan", "hdbscan", "optics"] +# Data cleaning parameters +STRAT_NUM = SCALAR | Literal["drop", "mean", "median", "knn", "most_frequent"] +DISCRETIZER_STRATS = Literal["uniform", "quantile", "kmeans", "custom"] +PRUNER_STRATS = Literal[ + "zscore", "iforest", "ee", "lof", "svm", "dbscan", "hdbscan", "optics" +] +SCALER_STRATS = Literal["standard", "minmax", "maxabs", "robust"] + + +# Plotting parameters +LEGEND = Literal[ + "upper left", + "lower left", + "upper right", + "lower right", + "upper center", + "lower center", + "center left", + "center right", + "center", + "out", +] # Classes for type hinting ========================================= >> @@ -96,28 +117,32 @@ class ENGINE(TypedDict, total=False): estimator: Literal["sklearn", "sklearnex", "cuml"] +@runtime_checkable class SCORER(Protocol): """Protocol for all scorers.""" def _score(self, method_caller, clf, X, y, sample_weight=None): ... +@runtime_checkable class TRANSFORMER(Protocol): """Protocol for all predictors.""" - def fit(self, **params): ... def transform(self, **params): ... +@runtime_checkable class PREDICTOR(Protocol): """Protocol for all predictors.""" def fit(self, **params): ... def predict(self, **params): ... +@runtime_checkable class ESTIMATOR(Protocol): """Protocol for all estimators.""" def fit(self, **params): ... +@runtime_checkable class BRANCH(Protocol): """Protocol for the Branch class.""" def _get_rows(self, **params): ... @@ -125,12 +150,14 @@ def _get_columns(self, **params): ... def _get_target(self, **params): ... +@runtime_checkable class MODEL(Protocol): """Protocol for all models.""" - def est_class(self): ... - def get_estimator(self, **params): ... + def _est_class(self): ... + def _get_est(self, **params): ... +@runtime_checkable class RUNNER(Protocol): """Protocol for all runners.""" def run(self, **params): ... 
diff --git a/atom/utils/utils.py b/atom/utils/utils.py index dce01fcb1..9f2fb715e 100644 --- a/atom/utils/utils.py +++ b/atom/utils/utils.py @@ -14,7 +14,7 @@ import sys import tempfile import warnings -from collections import OrderedDict, deque +from collections import deque from collections.abc import MutableMapping from contextlib import contextmanager from copy import copy, deepcopy @@ -25,10 +25,10 @@ from importlib.util import find_spec from inspect import Parameter, signature from itertools import cycle -from types import GeneratorType +from types import GeneratorType, MappingProxyType from typing import Any, Callable from unittest.mock import patch - +from joblib import Memory import mlflow import modin.pandas as md import numpy as np @@ -54,7 +54,7 @@ BRANCH, DATAFRAME, DATAFRAME_TYPES, ESTIMATOR, FEATURES, FLOAT, INDEX_SELECTOR, INT, INT_TYPES, MODEL, PANDAS, PANDAS_TYPES, PREDICTOR, SCALAR, SCORER, SEQUENCE, SEQUENCE_TYPES, SERIES, SERIES_TYPES, TARGET, - TRANSFORMER, + TRANSFORMER, BOOL ) @@ -118,10 +118,11 @@ def __init__(self, scorer: SCORER, task: str): self.scorer = scorer self.task = task - @staticmethod - def get_final_error(error: FLOAT, weight: FLOAT) -> FLOAT: + def get_final_error(self, error: FLOAT, weight: FLOAT) -> FLOAT: """Returns final value of metric based on error and weight. + Can't be a `staticmethod` because of CatBoost's implementation. + Parameters ---------- error: float @@ -1253,7 +1254,7 @@ def to_rgb(c: str) -> str: return c -def sign(obj: Callable) -> OrderedDict: +def sign(obj: Callable) -> MappingProxyType: """Get the parameters of an object. Parameters @@ -1263,7 +1264,7 @@ def sign(obj: Callable) -> OrderedDict: Returns ------- - OrderedDict + mappingproxy Object's parameters. 
""" @@ -1315,7 +1316,7 @@ def get_cols(elem: PANDAS) -> list[SERIES]: def variable_return( X: DATAFRAME | None, y: SERIES | None, -) -> DATAFRAME | SERIES | tuple[DATAFRAME, SERIES]: +) -> DATAFRAME | SERIES | tuple[DATAFRAME, PANDAS]: """Return one or two arguments depending on which is None. This utility is used to make methods return only the provided @@ -1326,7 +1327,7 @@ def variable_return( X: dataframe or None Feature set. - y: series or None + y: series, dataframe or None Target column. Returns @@ -1666,7 +1667,10 @@ def to_pyarrow(column: SERIES, inverse: bool = False) -> str: """ if not inverse and not column.dtype.name.endswith("[pyarrow]"): - return f"{column.dtype.name}[pyarrow]" + if column.dtype.name == "object": + return "string[pyarrow]" # pyarrow doesn't support object + else: + return f"{column.dtype.name}[pyarrow]" elif inverse and column.dtype.name.endswith("[pyarrow]"): return column.dtype.name[:-9] @@ -2092,7 +2096,12 @@ def get_feature_importance( return np.abs(data.flatten()) -def export_pipeline(pipeline: pd.Series, model: MODEL | None, memory, verbose) -> Any: +def export_pipeline( + pipeline: pd.Series, + model: MODEL | None = None, + memory: BOOL | str | Memory | None = None, + verbose: INT | None = None, +) -> Any: """Export a pipeline to a sklearn-like object. Optionally, you can add a model as final estimator. @@ -2516,7 +2525,7 @@ def fit_transform_one( y: TARGET | None = None, message: str | None = None, **fit_params, -) -> tuple[DATAFRAME | None, SERIES | None]: +) -> tuple[DATAFRAME | None, SERIES | None, TRANSFORMER]: """Fit and transform the data using one estimator. 
Parameters @@ -2565,10 +2574,10 @@ def fit_transform_one( def custom_transform( transformer: TRANSFORMER, branch: BRANCH, - data: tuple[DATAFRAME, SERIES] | None = None, + data: tuple[DATAFRAME, PANDAS] | None = None, verbose: int | None = None, method: str = "transform", -) -> tuple[DATAFRAME, SERIES]: +) -> tuple[DATAFRAME, PANDAS]: """Applies a transformer on a branch. This function is generic and should work for all @@ -2600,8 +2609,8 @@ def custom_transform( dataframe Feature set. - series - Target column. + series or dataframe + Target column(s). """ # Select provided data or from the branch diff --git a/docs/404.html b/docs/404.html index 3d4c3e8b2..5ea3a762c 100644 --- a/docs/404.html +++ b/docs/404.html @@ -1147,7 +1147,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1189,7 +1189,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1231,7 +1231,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • diff --git a/docs/API/ATOM/atomclassifier/index.html b/docs/API/ATOM/atomclassifier/index.html index 6220e7e60..8806c1621 100644 --- a/docs/API/ATOM/atomclassifier/index.html +++ b/docs/API/ATOM/atomclassifier/index.html @@ -1288,7 +1288,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1330,7 +1330,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1372,7 +1372,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3392,7 +3392,7 @@

    ATOMClassifier


    -

    class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Main class for classification tasks.

    +

    class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Main class for classification tasks.

    Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the @@ -3427,7 +3427,6 @@

    ATOMClassifier

    y: int, str, dict, sequence or dataframe, default=-1

    Target column corresponding to X.

    engine: dict or None, default=None
    +

    engine: dict, default={"data": "numpy", "estimator": "sklearn"}
    Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

    +corresponding choice as values. Choose from:

    mapping: dict
    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).
    scaled: bool
    Whether the feature set is scaled.

    +etc...).
    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).
    scaled: bool
    Whether the feature set is scaled.

    A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only -0s and 1s) are excluded from the calculation.

    duplicates: series
    Number of duplicate rows in the dataset.
    missing: list
    Values that are considered "missing".

    +0s and 1s) are excluded from the calculation.
    duplicates: int
    Number of duplicate rows in the dataset.
    missing: list
    Values that are considered "missing".

    These values are used by the clean and impute methods. Default values are: None, NaN, NaT, +inf, -inf, "", "?", "None", "NA", "nan", "NaN", "NaT", "inf". Note that None, NaN, +inf and -inf are always considered -missing since they are incompatible with sklearn estimators.

    nans: series | None
    Columns with the number of missing values in them.
    n_nans: int | None
    Number of samples containing missing values.
    numerical: series
    Names of the numerical features in the dataset.
    n_numerical: int
    Number of numerical features in the dataset.
    categorical: series
    Names of the categorical features in the dataset.
    n_categorical: int
    Number of categorical features in the dataset.
    outliers: series | None
    Columns in training set with amount of outlier values.
    n_outliers: int | None
    Number of samples in the training set containing outliers.
    classes: pd.DataFrame | None
    Distribution of target classes per data set.
    n_classes: int | series | None
    Number of classes in the target column(s).

    +missing since they are incompatible with sklearn estimators.
    nans: series | None
    Columns with the number of missing values in them.
    n_nans: int | None
    Number of samples containing missing values.
    numerical: index
    Names of the numerical features in the dataset.
    n_numerical: int
    Number of numerical features in the dataset.
    categorical: index
    Names of the categorical features in the dataset.
    n_categorical: int
    Number of categorical features in the dataset.
    outliers: pd.Series | None
    Columns in training set with amount of outlier values.
    n_outliers: int | None
    Number of samples in the training set containing outliers.
    classes: pd.DataFrame | None
    Distribution of target classes per data set.
    n_classes: int | series | None
    Number of classes in the target column(s).


    Utility attributes

    @@ -3669,7 +3667,7 @@

    Plot attributes

    The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

    -

    Attributes
    palette: str | SEQUENCE
    Color palette.

    +

    Attributes
    palette: str | sequence
    Color palette.

    Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = ["red", "green", "blue"].

    title_fontsize: int
    Fontsize for the plot's title.
    label_fontsize: int
    Fontsize for the labels, legend and hover information.
    tick_fontsize: int
    Fontsize for the ticks along the plot's axes.
    line_width: int
    Width of the line plots.
    marker_size: int
    Size of the markers.
    @@ -3681,7 +3679,7 @@

    Utility methods

    add: Add a transformer to the pipeline.
    apply: Apply a function to the dataset.
    automl: Search for an optimized pipeline in an automated fashion.
    available_models: Give an overview of the available predefined models.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from all models.
    delete: Delete models.
    distribution: Get statistics on column distributions.
    eda: Create an Exploratory Data Analysis report.
    evaluate: Get all models' scores for the provided metrics.
    export_pipeline: Export the pipeline to a sklearn-like object.
    get_class_weight: Return class weights for a balanced data set.
    get_sample_weight: Return sample weights for a balanced data set.
    inverse_transform: Inversely transform new data through the pipeline.
    load: Loads an atom instance from a pickle file.
    log: Print message and save to log file.
    merge: Merge another instance of the same class into this one.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.
    reset: Reset the instance to its initial state.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    save: Save the instance to a pickle file.
    save_data: Save the data in the current branch to a .csv file.
    shrink: Converts the columns to the smallest possible matching dtype.
    stacking: Add a Stacking model to the pipeline.
    stats: Display basic information about the dataset.
    status: Get an overview of the branches and models.
    transform: Transform new data through the pipeline.
    voting: Add a Voting model to the pipeline.


    -

    method add(transformer, columns=None, train_only=False, **fit_params)[source]
    Add a transformer to the pipeline.

    +

    method add(transformer, columns=None, train_only=False, **fit_params)[source]
    Add a transformer to the pipeline.

    If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is @@ -3744,10 +3742,9 @@

    Utility methods



    -

    method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]
    Apply a function to the dataset.

    -

    The function should have signature func(dataset, **kw_args) -> -dataset. This method is useful for stateless transformations -such as taking the log, doing custom scaling, etc...

    +

    method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]
    Apply a function to the dataset.

    +

    This method is useful for stateless transformations such as +taking the log, doing custom scaling, etc...

    Note

    This approach is preferred over changing the dataset directly @@ -3760,7 +3757,8 @@

    Utility methods

    Parameters
    func: callable
    -Function to apply.

    +Function to apply with signature func(dataset, **kw_args) -> +dataset.

    inverse_func: callable or None, default=None
    Inverse function of func. If None, the inverse_transform method returns the input unchanged.

    @@ -3771,7 +3769,7 @@

    Utility methods



    -

    method automl(**kwargs)[source]
    Search for an optimized pipeline in an automated fashion.

    +

    method automl(**kwargs)[source]
    Search for an optimized pipeline in an automated fashion.

    Automated machine learning (AutoML) automates the selection, composition and parameterization of machine learning pipelines. Automating the machine learning often provides faster, more @@ -3793,7 +3791,7 @@

    Utility methods



    -

    method available_models()[source]
    Give an overview of the available predefined models.

    +

    method available_models()[source]
    Give an overview of the available predefined models.

    Returns
    pd.DataFrame
    Information about the available predefined models. Columns @@ -3815,7 +3813,7 @@

    Utility methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3860,7 +3858,7 @@

    Utility methods



    -

    method clear()[source]
    Reset attributes and clear cache from all models.

    +

    method clear()[source]
    Reset attributes and clear cache from all models.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3875,7 +3873,7 @@

    Utility methods

  • Cached holdout data sets


  • -

    method delete(models=None)[source]
    Delete models.

    +

    method delete(models=None)[source]
    Delete models.

    If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3886,7 +3884,7 @@

    Utility methods



    -

    method distribution(distributions=None, columns=None)[source]
    Get statistics on column distributions.

    +

    method distribution(distributions=None, columns=None)[source]
    Get statistics on column distributions.

    Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

    @@ -3916,7 +3914,7 @@

    Utility methods



    -

    method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)[source]
    Create an Exploratory Data Analysis report.

    +

    method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)[source]
    Create an Exploratory Data Analysis report.

    ATOM uses the ydata-profiling package for the EDA. The report is rendered directly in the notebook. The created ProfileReport instance can be accessed through the report @@ -3940,7 +3938,7 @@

    Utility methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    Parameters
    metric: str, func, scorer, sequence or None, default=None
    Metric to calculate. If None, it returns an overview of @@ -3967,7 +3965,7 @@

    Utility methods



    -

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    +

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

    @@ -4006,7 +4004,7 @@

    Utility methods



    -

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    +

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -4021,7 +4019,7 @@

    Utility methods



    -

    method get_sample_weight(dataset="train")[source]
    Return sample weights for a balanced data set.

    +

    method get_sample_weight(dataset="train")[source]
    Return sample weights for a balanced data set.

    The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

    @@ -4034,7 +4032,7 @@

    Utility methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -4064,7 +4062,7 @@

    Utility methods



    -

    function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]
    Loads an atom instance from a pickle file.

    +

    function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]
    Loads an atom instance from a pickle file.

    If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

    @@ -4114,7 +4112,7 @@

    Utility methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -4126,7 +4124,7 @@

    Utility methods



    -

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    +

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -4144,7 +4142,7 @@

    Utility methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4153,7 +4151,7 @@

    Utility methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    @@ -4162,13 +4160,13 @@

    Utility methods



    -

    method reset()[source]
    Reset the instance to its initial state.

    +

    method reset()[source]
    Reset the instance to its initial state.

    Deletes all branches and models. The dataset is also reset to its form after initialization.



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4179,7 +4177,7 @@

    Utility methods



    -

    method save_data(filename="auto", dataset="dataset", **kwargs)[source]
    Save the data in the current branch to a .csv file.

    +

    method save_data(filename="auto", dataset="dataset", **kwargs)[source]
    Save the data in the current branch to a .csv file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4190,7 +4188,7 @@

    Utility methods



    -

    method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]
    Converts the columns to the smallest possible matching dtype.

    +

    method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]
    Converts the columns to the smallest possible matching dtype.

    Parameters
    int2bool: bool, default=False
    Whether to convert int columns to bool type. Only if the @@ -4211,7 +4209,7 @@

    Utility methods



    -

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    +

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4231,18 +4229,18 @@

    Utility methods



    -

    method stats(_vb=-2)[source]
    Display basic information about the dataset.

    +

    method stats(_vb=-2)[source]
    Display basic information about the dataset.

    Parameters
    _vb: int, default=-2
    Internal parameter to always print if called by user.



    -

    method status()[source]
    Get an overview of the branches and models.

    +

    method status()[source]
    Get an overview of the branches and models.

    This method prints the same information as the __repr__ and also saves it to the logger.



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4272,7 +4270,7 @@

    Utility methods



    -

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    +

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4305,7 +4303,7 @@

    Data cleaning

    balance: Balance the number of rows per class in the target column.
    clean: Applies standard data cleaning steps on the dataset.
    discretize: Bin continuous data into intervals.
    encode: Perform encoding of categorical features.
    impute: Handle missing values in the dataset.
    normalize: Transform the data to follow a Normal/Gaussian distribution.
    prune: Prune outliers from the training set.
    scale: Scale the data.


    -

    method balance(strategy="adasyn", **kwargs)[source]
    Balance the number of rows per class in the target column.

    +

    method balance(strategy="adasyn", **kwargs)[source]
    Balance the number of rows per class in the target column.

    When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the @@ -4326,7 +4324,7 @@

    Data cleaning

    of the target class distribution per data set.



    -

    method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]
    Applies standard data cleaning steps on the dataset.

    +

    method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]
    Applies standard data cleaning steps on the dataset.

    Use the parameters to choose which transformations to perform. The available steps are:

    See the Cleaner class for a description of the parameters.



    -

    method discretize(strategy="quantile", bins=5, labels=None, **kwargs)[source]
    Bin continuous data into intervals.

    +

    method discretize(strategy="quantile", bins=5, labels=None, **kwargs)[source]
    Bin continuous data into intervals.

    For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

    @@ -4351,7 +4349,7 @@

    Data cleaning

    distribution and decide on the bins.



    -

    method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)[source]
    Perform encoding of categorical features.

    +

    method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)[source]
    Perform encoding of categorical features.

    The encoding type depends on the number of classes in the column:



    -

    method delete(models=None)[source]
    Delete models.

    +

    method delete(models=None)[source]
    Delete models.

    If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3861,7 +3860,7 @@

    Utility methods



    -

    method distribution(distributions=None, columns=None)[source]
    Get statistics on column distributions.

    +

    method distribution(distributions=None, columns=None)[source]
    Get statistics on column distributions.

    Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

    @@ -3891,7 +3890,7 @@

    Utility methods



    -

    method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)[source]
    Create an Exploratory Data Analysis report.

    +

    method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)[source]
    Create an Exploratory Data Analysis report.

    ATOM uses the ydata-profiling package for the EDA. The report is rendered directly in the notebook. The created ProfileReport instance can be accessed through the report @@ -3915,7 +3914,7 @@

    Utility methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    Parameters
    metric: str, func, scorer, sequence or None, default=None
    Metric to calculate. If None, it returns an overview of @@ -3942,7 +3941,7 @@

    Utility methods



    -

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    +

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

    @@ -3981,7 +3980,7 @@

    Utility methods



    -

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    +

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3996,7 +3995,7 @@

    Utility methods



    -

    method get_sample_weight(dataset="train")[source]
    Return sample weights for a balanced data set.

    +

    method get_sample_weight(dataset="train")[source]
    Return sample weights for a balanced data set.

    The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

    @@ -4009,7 +4008,7 @@

    Utility methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -4039,7 +4038,7 @@

    Utility methods



    -

    function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]
    Loads an atom instance from a pickle file.

    +

    function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]
    Loads an atom instance from a pickle file.

    If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

    @@ -4089,7 +4088,7 @@

    Utility methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -4101,7 +4100,7 @@

    Utility methods



    -

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    +

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -4119,7 +4118,7 @@

    Utility methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4128,7 +4127,7 @@

    Utility methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    @@ -4137,13 +4136,13 @@

    Utility methods



    -

    method reset()[source]
    Reset the instance to its initial state.

    +

    method reset()[source]
    Reset the instance to its initial state.

    Deletes all branches and models. The dataset is also reset to its form after initialization.



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4154,7 +4153,7 @@

    Utility methods



    -

    method save_data(filename="auto", dataset="dataset", **kwargs)[source]
    Save the data in the current branch to a .csv file.

    +

    method save_data(filename="auto", dataset="dataset", **kwargs)[source]
    Save the data in the current branch to a .csv file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4165,7 +4164,7 @@

    Utility methods



    -

    method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]
    Converts the columns to the smallest possible matching dtype.

    +

    method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]
    Converts the columns to the smallest possible matching dtype.

    Parametersint2bool: bool, default=False
    Whether to convert int columns to bool type. Only if the @@ -4186,7 +4185,7 @@

    Utility methods



    -

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    +

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4206,18 +4205,18 @@

    Utility methods



    -

    method stats(_vb=-2)[source]
    Display basic information about the dataset.

    +

    method stats(_vb=-2)[source]
    Display basic information about the dataset.

    Parameters_vb: int, default=-2
    Internal parameter to always print if called by user.



    -

    method status()[source]
    Get an overview of the branches and models.

    +

    method status()[source]
    Get an overview of the branches and models.

    This method prints the same information as the __repr__ and also saves it to the logger.



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4247,7 +4246,7 @@

    Utility methods



    -

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    +

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4280,7 +4279,7 @@

    Data cleaning

    cleanApplies standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.


    -

    method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]
    Applies standard data cleaning steps on the dataset.

    +

    method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]
    Applies standard data cleaning steps on the dataset.

    Use the parameters to choose which transformations to perform. The available steps are:

    See the Cleaner class for a description of the parameters.



    -

    method discretize(strategy="quantile", bins=5, labels=None, **kwargs)[source]
    Bin continuous data into intervals.

    +

    method discretize(strategy="quantile", bins=5, labels=None, **kwargs)[source]
    Bin continuous data into intervals.

    For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

    @@ -4305,7 +4304,7 @@

    Data cleaning

    distribution and decide on the bins.



    -

    method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)[source]
    Perform encoding of categorical features.

    +

    method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)[source]
    Perform encoding of categorical features.

    The encoding type depends on the number of classes in the column:



    -

    method delete(models=None)[source]
    Delete models.

    +

    method delete(models=None)[source]
    Delete models.

    If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3874,7 +3873,7 @@

    Utility methods



    -

    method distribution(distributions=None, columns=None)[source]
    Get statistics on column distributions.

    +

    method distribution(distributions=None, columns=None)[source]
    Get statistics on column distributions.

    Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

    @@ -3904,7 +3903,7 @@

    Utility methods



    -

    method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)[source]
    Create an Exploratory Data Analysis report.

    +

    method eda(dataset="dataset", n_rows=None, filename=None, **kwargs)[source]
    Create an Exploratory Data Analysis report.

    ATOM uses the ydata-profiling package for the EDA. The report is rendered directly in the notebook. The created ProfileReport instance can be accessed through the report @@ -3928,7 +3927,7 @@

    Utility methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    Parametersmetric: str, func, scorer, sequence or None, default=None
    Metric to calculate. If None, it returns an overview of @@ -3955,7 +3954,7 @@

    Utility methods



    -

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    +

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

    @@ -3994,7 +3993,7 @@

    Utility methods



    -

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    +

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -4009,7 +4008,7 @@

    Utility methods



    -

    method get_sample_weight(dataset="train")[source]
    Return sample weights for a balanced data set.

    +

    method get_sample_weight(dataset="train")[source]
    Return sample weights for a balanced data set.

    The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

    @@ -4022,7 +4021,7 @@

    Utility methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -4052,7 +4051,7 @@

    Utility methods



    -

    function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]
    Loads an atom instance from a pickle file.

    +

    function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]
    Loads an atom instance from a pickle file.

    If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

    @@ -4102,7 +4101,7 @@

    Utility methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -4114,7 +4113,7 @@

    Utility methods



    -

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    +

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -4132,7 +4131,7 @@

    Utility methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4141,7 +4140,7 @@

    Utility methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    @@ -4150,13 +4149,13 @@

    Utility methods



    -

    method reset()[source]
    Reset the instance to its initial state.

    +

    method reset()[source]
    Reset the instance to its initial state.

    Deletes all branches and models. The dataset is also reset to its form after initialization.



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4167,7 +4166,7 @@

    Utility methods



    -

    method save_data(filename="auto", dataset="dataset", **kwargs)[source]
    Save the data in the current branch to a .csv file.

    +

    method save_data(filename="auto", dataset="dataset", **kwargs)[source]
    Save the data in the current branch to a .csv file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4178,7 +4177,7 @@

    Utility methods



    -

    method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]
    Converts the columns to the smallest possible matching dtype.

    +

    method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]
    Converts the columns to the smallest possible matching dtype.

    Parametersint2bool: bool, default=False
    Whether to convert int columns to bool type. Only if the @@ -4199,7 +4198,7 @@

    Utility methods



    -

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    +

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4219,18 +4218,18 @@

    Utility methods



    -

    method stats(_vb=-2)[source]
    Display basic information about the dataset.

    +

    method stats(_vb=-2)[source]
    Display basic information about the dataset.

    Parameters_vb: int, default=-2
    Internal parameter to always print if called by user.



    -

    method status()[source]
    Get an overview of the branches and models.

    +

    method status()[source]
    Get an overview of the branches and models.

    This method prints the same information as the __repr__ and also saves it to the logger.



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4260,7 +4259,7 @@

    Utility methods



    -

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    +

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4293,7 +4292,7 @@

    Data cleaning

    cleanApplies standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.


    -

    method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]
    Applies standard data cleaning steps on the dataset.

    +

    method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]
    Applies standard data cleaning steps on the dataset.

    Use the parameters to choose which transformations to perform. The available steps are:

    See the Cleaner class for a description of the parameters.



    -

    method discretize(strategy="quantile", bins=5, labels=None, **kwargs)[source]
    Bin continuous data into intervals.

    +

    method discretize(strategy="quantile", bins=5, labels=None, **kwargs)[source]
    Bin continuous data into intervals.

    For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

    @@ -4318,7 +4317,7 @@

    Data cleaning

    distribution and decide on the bins.



    -

    method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)[source]
    Perform encoding of categorical features.

    +

    method encode(strategy="Target", max_onehot=10, ordinal=None, infrequent_to_value=None, value="rare", **kwargs)[source]
    Perform encoding of categorical features.

    The encoding type depends on the number of classes in the column:

    For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3504,7 +3504,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3515,7 +3515,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3532,7 +3532,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3577,7 +3577,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3592,7 +3592,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3604,7 +3604,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3635,7 +3635,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3649,14 +3649,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3668,7 +3668,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3701,7 +3701,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3736,7 +3736,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3751,7 +3751,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3774,7 +3774,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3786,7 +3786,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3799,7 +3799,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3831,7 +3831,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3843,7 +3843,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3861,14 +3861,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3879,7 +3879,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3898,7 +3898,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3916,9 +3916,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3929,14 +3929,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3949,7 +3949,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3976,7 +3976,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3998,7 +3998,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4030,7 +4030,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4039,7 +4039,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/ard/index.html b/docs/API/models/ard/index.html index d0b441116..7af408572 100644 --- a/docs/API/models/ard/index.html +++ b/docs/API/models/ard/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3367,7 +3367,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3384,16 +3384,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3441,7 +3441,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3452,7 +3452,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3469,7 +3469,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3514,7 +3514,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3529,7 +3529,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3541,7 +3541,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3572,7 +3572,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3586,14 +3586,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3605,7 +3605,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3638,7 +3638,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3673,7 +3673,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3688,7 +3688,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3711,7 +3711,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3723,7 +3723,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3736,7 +3736,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3768,7 +3768,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3780,7 +3780,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3798,14 +3798,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3816,7 +3816,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3835,7 +3835,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3853,9 +3853,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3866,14 +3866,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3886,7 +3886,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3913,7 +3913,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3935,7 +3935,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3967,7 +3967,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3976,7 +3976,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/arima/index.html b/docs/API/models/arima/index.html index 5afadf367..7e93d0354 100644 --- a/docs/API/models/arima/index.html +++ b/docs/API/models/arima/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3386,7 +3386,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3403,16 +3403,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3520,7 +3520,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3531,7 +3531,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3548,7 +3548,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3593,7 +3593,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3608,7 +3608,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3620,7 +3620,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3651,7 +3651,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3665,7 +3665,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3698,7 +3698,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3733,7 +3733,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3748,7 +3748,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3771,7 +3771,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3783,7 +3783,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3796,7 +3796,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3828,7 +3828,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3840,7 +3840,7 @@

    Methods



    -

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3860,7 +3860,7 @@

    Methods



    -

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    +

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

    @@ -3882,7 +3882,7 @@

    Methods



    -

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3903,7 +3903,7 @@

    Methods



    -

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

    @@ -3927,7 +3927,7 @@

    Methods



    -

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    +

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

    @@ -3946,7 +3946,7 @@

    Methods



    -

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

    @@ -3969,7 +3969,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3987,9 +3987,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -4000,14 +4000,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -4039,7 +4039,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4061,7 +4061,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4093,7 +4093,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4102,7 +4102,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/autoarima/index.html b/docs/API/models/autoarima/index.html index a50bb177f..a46e0003d 100644 --- a/docs/API/models/autoarima/index.html +++ b/docs/API/models/autoarima/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3377,7 +3377,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3394,16 +3394,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3511,7 +3511,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3522,7 +3522,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3539,7 +3539,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3584,7 +3584,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3599,7 +3599,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3611,7 +3611,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3642,7 +3642,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3656,7 +3656,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3689,7 +3689,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3724,7 +3724,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3739,7 +3739,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3762,7 +3762,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3774,7 +3774,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3787,7 +3787,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3819,7 +3819,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3831,7 +3831,7 @@

    Methods



    -

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3851,7 +3851,7 @@

    Methods



    -

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    +

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

    @@ -3873,7 +3873,7 @@

    Methods



    -

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3894,7 +3894,7 @@

    Methods



    -

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

    @@ -3918,7 +3918,7 @@

    Methods



    -

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    +

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

    @@ -3937,7 +3937,7 @@

    Methods



    -

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

    @@ -3960,7 +3960,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3978,9 +3978,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3991,14 +3991,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -4030,7 +4030,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4052,7 +4052,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4084,7 +4084,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4093,7 +4093,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/bag/index.html b/docs/API/models/bag/index.html index bfa1815ab..6c8f9a865 100644 --- a/docs/API/models/bag/index.html +++ b/docs/API/models/bag/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3378,7 +3378,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3395,16 +3395,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3506,7 +3506,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3517,7 +3517,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3534,7 +3534,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3579,7 +3579,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3594,7 +3594,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3606,7 +3606,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3637,7 +3637,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3651,14 +3651,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3670,7 +3670,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3703,7 +3703,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3738,7 +3738,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3753,7 +3753,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3776,7 +3776,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3788,7 +3788,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3801,7 +3801,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3833,7 +3833,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3845,7 +3845,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3863,14 +3863,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3881,7 +3881,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3900,7 +3900,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3918,9 +3918,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3931,14 +3931,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3951,7 +3951,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3978,7 +3978,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4000,7 +4000,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4032,7 +4032,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4041,7 +4041,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/bnb/index.html b/docs/API/models/bnb/index.html index f87ba9638..47d3bed13 100644 --- a/docs/API/models/bnb/index.html +++ b/docs/API/models/bnb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3374,7 +3374,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3391,16 +3391,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3484,7 +3484,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3495,7 +3495,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3512,7 +3512,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3557,7 +3557,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3572,7 +3572,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3584,7 +3584,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3615,7 +3615,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3629,14 +3629,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3648,7 +3648,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3681,7 +3681,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3716,7 +3716,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3731,7 +3731,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3754,7 +3754,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3766,7 +3766,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3779,7 +3779,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3811,7 +3811,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3823,7 +3823,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3841,14 +3841,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3859,7 +3859,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3878,7 +3878,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3896,9 +3896,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3909,14 +3909,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3929,7 +3929,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3956,7 +3956,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3978,7 +3978,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4010,7 +4010,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4019,7 +4019,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/br/index.html b/docs/API/models/br/index.html index 4ada82b23..46dc50213 100644 --- a/docs/API/models/br/index.html +++ b/docs/API/models/br/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3366,7 +3366,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3383,16 +3383,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3440,7 +3440,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3451,7 +3451,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3468,7 +3468,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3513,7 +3513,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3528,7 +3528,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3540,7 +3540,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3571,7 +3571,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3585,14 +3585,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3604,7 +3604,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3637,7 +3637,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3672,7 +3672,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3687,7 +3687,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3710,7 +3710,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3722,7 +3722,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3735,7 +3735,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3767,7 +3767,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3779,7 +3779,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3797,14 +3797,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3815,7 +3815,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3834,7 +3834,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3852,9 +3852,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3865,14 +3865,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3885,7 +3885,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3912,7 +3912,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3934,7 +3934,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3966,7 +3966,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3975,7 +3975,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/catb/index.html b/docs/API/models/catb/index.html index 2f25dfbc8..d94e2ec96 100644 --- a/docs/API/models/catb/index.html +++ b/docs/API/models/catb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3397,7 +3397,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3414,19 +3414,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3510,7 +3510,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3521,7 +3521,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3538,7 +3538,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3583,7 +3583,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3598,7 +3598,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3610,7 +3610,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3641,7 +3641,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3655,14 +3655,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3674,7 +3674,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3707,7 +3707,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3742,7 +3742,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3757,7 +3757,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3780,7 +3780,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3792,7 +3792,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3805,7 +3805,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3837,7 +3837,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3849,7 +3849,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3867,14 +3867,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3885,7 +3885,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3904,7 +3904,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3922,9 +3922,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3935,14 +3935,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3955,7 +3955,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3982,7 +3982,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4004,7 +4004,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4036,7 +4036,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4045,7 +4045,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/catnb/index.html b/docs/API/models/catnb/index.html index 01a528f8e..985e2032a 100644 --- a/docs/API/models/catnb/index.html +++ b/docs/API/models/catnb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3373,7 +3373,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3390,16 +3390,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3483,7 +3483,7 @@

    Methods

    bootstrapping: Apply a bootstrap algorithm.
    calibrate: Calibrate the model.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from the model.
    create_app: Create an interactive app to test model predictions.
    create_dashboard: Create an interactive dashboard to analyze the model.
    cross_validate: Evaluate the model using cross-validation.
    decision_function: Get confidence scores on new data or existing rows.
    evaluate: Get the model's scores for the provided metrics.
    export_pipeline: Export the model's pipeline to a sklearn-like object.
    fit: Fit and validate the model.
    full_train: Train the estimator on the complete dataset.
    get_best_threshold: Get the threshold that maximizes the ROC curve.
    hyperparameter_tuning: Run the hyperparameter tuning algorithm.
    inverse_transform: Inversely transform new data through the pipeline.
    log: Print message and save to log file.
    predict: Get predictions on new data or existing rows.
    predict_log_proba: Get class log-probabilities on new data or existing rows.
    predict_proba: Get class probabilities on new data or existing rows.
    register: Register the model in mlflow's model registry.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    save: Save the instance to a pickle file.
    save_estimator: Save the estimator to a pickle file.
    score: Get a metric score on new data.
    serve: Serve the model as a REST API endpoint for inference.
    transform: Transform new data through the pipeline.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3494,7 +3494,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3511,7 +3511,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3556,7 +3556,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3571,7 +3571,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3583,7 +3583,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3614,7 +3614,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3628,14 +3628,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3647,7 +3647,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3680,7 +3680,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3715,7 +3715,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3730,7 +3730,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3753,7 +3753,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3765,7 +3765,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3778,7 +3778,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3810,7 +3810,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3822,7 +3822,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3840,14 +3840,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3858,7 +3858,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3877,7 +3877,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3895,9 +3895,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3908,14 +3908,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3928,7 +3928,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3955,7 +3955,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3977,7 +3977,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4009,7 +4009,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4018,7 +4018,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/cnb/index.html b/docs/API/models/cnb/index.html index 97289dc21..b635aee53 100644 --- a/docs/API/models/cnb/index.html +++ b/docs/API/models/cnb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3373,7 +3373,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3390,16 +3390,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3483,7 +3483,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3494,7 +3494,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3511,7 +3511,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3556,7 +3556,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3571,7 +3571,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3583,7 +3583,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3614,7 +3614,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3628,14 +3628,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3647,7 +3647,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3680,7 +3680,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3715,7 +3715,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3730,7 +3730,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3753,7 +3753,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3765,7 +3765,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3778,7 +3778,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3810,7 +3810,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3822,7 +3822,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3840,14 +3840,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3858,7 +3858,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3877,7 +3877,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3895,9 +3895,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3908,14 +3908,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3928,7 +3928,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3955,7 +3955,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3977,7 +3977,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4009,7 +4009,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4018,7 +4018,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/dummy/index.html b/docs/API/models/dummy/index.html index a4867bf8b..18ed14a68 100644 --- a/docs/API/models/dummy/index.html +++ b/docs/API/models/dummy/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3376,7 +3376,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3393,16 +3393,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3486,7 +3486,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3497,7 +3497,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3514,7 +3514,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3559,7 +3559,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3574,7 +3574,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3586,7 +3586,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3617,7 +3617,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3631,14 +3631,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3650,7 +3650,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3683,7 +3683,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3718,7 +3718,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3733,7 +3733,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3756,7 +3756,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3768,7 +3768,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3781,7 +3781,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3813,7 +3813,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3825,7 +3825,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3843,14 +3843,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3861,7 +3861,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3880,7 +3880,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3898,9 +3898,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3911,14 +3911,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3931,7 +3931,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3958,7 +3958,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3980,7 +3980,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4012,7 +4012,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4021,7 +4021,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/en/index.html b/docs/API/models/en/index.html index 443e4c6a1..637b3353d 100644 --- a/docs/API/models/en/index.html +++ b/docs/API/models/en/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3383,7 +3383,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3400,16 +3400,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3457,7 +3457,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3468,7 +3468,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3485,7 +3485,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3530,7 +3530,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3545,7 +3545,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3557,7 +3557,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3588,7 +3588,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3602,14 +3602,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3621,7 +3621,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3654,7 +3654,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3689,7 +3689,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3704,7 +3704,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3727,7 +3727,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3739,7 +3739,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3752,7 +3752,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3784,7 +3784,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3796,7 +3796,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3814,14 +3814,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3832,7 +3832,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3851,7 +3851,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3869,9 +3869,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3882,14 +3882,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3902,7 +3902,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3929,7 +3929,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3951,7 +3951,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3983,7 +3983,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3992,7 +3992,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/es/index.html b/docs/API/models/es/index.html index b6e048495..1e2c0148b 100644 --- a/docs/API/models/es/index.html +++ b/docs/API/models/es/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3365,7 +3365,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3382,16 +3382,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3454,7 +3454,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3465,7 +3465,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3482,7 +3482,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3527,7 +3527,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3542,7 +3542,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3554,7 +3554,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3585,7 +3585,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3599,7 +3599,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3632,7 +3632,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3667,7 +3667,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3682,7 +3682,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3705,7 +3705,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3717,7 +3717,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3730,7 +3730,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3762,7 +3762,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3774,7 +3774,7 @@

    Methods



    -

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3794,7 +3794,7 @@

    Methods



    -

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    +

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

    @@ -3816,7 +3816,7 @@

    Methods



    -

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3837,7 +3837,7 @@

    Methods



    -

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get quantile forecasts on new data or existing rows.

    +

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get quantile forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

    @@ -3861,7 +3861,7 @@

    Methods



    -

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    +

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

    @@ -3880,7 +3880,7 @@

    Methods



    -

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get variance forecasts on new data or existing rows.

    +

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get variance forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

    @@ -3903,7 +3903,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3921,9 +3921,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3934,14 +3934,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3973,7 +3973,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3995,7 +3995,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4027,7 +4027,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4036,7 +4036,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/et/index.html b/docs/API/models/et/index.html index 0dbc810e8..c6f9a81d4 100644 --- a/docs/API/models/et/index.html +++ b/docs/API/models/et/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3375,7 +3375,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3392,16 +3392,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3485,7 +3485,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3496,7 +3496,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3513,7 +3513,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3558,7 +3558,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3573,7 +3573,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3585,7 +3585,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3616,7 +3616,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3630,14 +3630,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3649,7 +3649,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3682,7 +3682,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3717,7 +3717,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3732,7 +3732,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3755,7 +3755,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3767,7 +3767,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3780,7 +3780,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3812,7 +3812,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3824,7 +3824,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3842,14 +3842,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3860,7 +3860,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3879,7 +3879,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3897,9 +3897,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3910,14 +3910,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3930,7 +3930,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3957,7 +3957,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3979,7 +3979,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4011,7 +4011,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4020,7 +4020,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/etree/index.html b/docs/API/models/etree/index.html index 9513ea8ff..96312fd99 100644 --- a/docs/API/models/etree/index.html +++ b/docs/API/models/etree/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3377,7 +3377,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3394,16 +3394,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3487,7 +3487,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as a REST API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3498,7 +3498,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3515,7 +3515,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3560,7 +3560,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3575,7 +3575,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3587,7 +3587,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3618,7 +3618,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3632,14 +3632,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3651,7 +3651,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3684,7 +3684,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3719,7 +3719,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3734,7 +3734,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3757,7 +3757,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3769,7 +3769,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3782,7 +3782,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3814,7 +3814,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3826,7 +3826,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3844,14 +3844,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3862,7 +3862,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3881,7 +3881,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3899,9 +3899,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3912,14 +3912,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3932,7 +3932,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3959,7 +3959,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3981,7 +3981,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4013,7 +4013,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4022,7 +4022,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/ets/index.html b/docs/API/models/ets/index.html index c9f5fbc88..b1cc1f935 100644 --- a/docs/API/models/ets/index.html +++ b/docs/API/models/ets/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3366,7 +3366,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3383,16 +3383,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3500,7 +3500,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet quantile forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet variance forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as a REST API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3511,7 +3511,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3528,7 +3528,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3573,7 +3573,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3588,7 +3588,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3600,7 +3600,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3631,7 +3631,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3645,7 +3645,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3678,7 +3678,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3713,7 +3713,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3728,7 +3728,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3751,7 +3751,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3763,7 +3763,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3776,7 +3776,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3808,7 +3808,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3820,7 +3820,7 @@

    Methods



    -

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3840,7 +3840,7 @@

    Methods



    -

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    +

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

    @@ -3862,7 +3862,7 @@

    Methods



    -

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3883,7 +3883,7 @@

    Methods



    -

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get quantile forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

    @@ -3907,7 +3907,7 @@

    Methods



    -

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    +

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

    @@ -3926,7 +3926,7 @@

    Methods



    -

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get variance forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

    @@ -3949,7 +3949,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3967,9 +3967,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3980,14 +3980,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -4019,7 +4019,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as rest API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4041,7 +4041,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4073,7 +4073,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4082,7 +4082,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/gbm/index.html b/docs/API/models/gbm/index.html index b9600280a..8af6701ca 100644 --- a/docs/API/models/gbm/index.html +++ b/docs/API/models/gbm/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3382,7 +3382,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3399,16 +3399,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3510,7 +3510,7 @@

    Methods

  • bootstrapping: Apply a bootstrap algorithm.
  • calibrate: Calibrate the model.
  • canvas: Create a figure with multiple plots.
  • clear: Reset attributes and clear cache from the model.
  • create_app: Create an interactive app to test model predictions.
  • create_dashboard: Create an interactive dashboard to analyze the model.
  • cross_validate: Evaluate the model using cross-validation.
  • decision_function: Get confidence scores on new data or existing rows.
  • evaluate: Get the model's scores for the provided metrics.
  • export_pipeline: Export the model's pipeline to a sklearn-like object.
  • fit: Fit and validate the model.
  • full_train: Train the estimator on the complete dataset.
  • get_best_threshold: Get the threshold that maximizes the ROC curve.
  • hyperparameter_tuning: Run the hyperparameter tuning algorithm.
  • inverse_transform: Inversely transform new data through the pipeline.
  • log: Print message and save to log file.
  • predict: Get predictions on new data or existing rows.
  • predict_log_proba: Get class log-probabilities on new data or existing rows.
  • predict_proba: Get class probabilities on new data or existing rows.
  • register: Register the model in mlflow's model registry.
  • reset_aesthetics: Reset the plot aesthetics to their default values.
  • save: Save the instance to a pickle file.
  • save_estimator: Save the estimator to a pickle file.
  • score: Get a metric score on new data.
  • serve: Serve the model as rest API endpoint for inference.
  • transform: Transform new data through the pipeline.
  • update_layout: Update the properties of the plot's layout.
  • update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3521,7 +3521,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3538,7 +3538,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3583,7 +3583,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3598,7 +3598,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3610,7 +3610,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3641,7 +3641,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3655,14 +3655,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3674,7 +3674,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3707,7 +3707,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3742,7 +3742,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3757,7 +3757,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3780,7 +3780,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3792,7 +3792,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3805,7 +3805,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3837,7 +3837,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3849,7 +3849,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3867,14 +3867,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3885,7 +3885,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3904,7 +3904,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3922,9 +3922,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3935,14 +3935,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3955,7 +3955,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3982,7 +3982,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as rest API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4004,7 +4004,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4036,7 +4036,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4045,7 +4045,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/gnb/index.html b/docs/API/models/gnb/index.html index 2e1383ffc..a9eb8b207 100644 --- a/docs/API/models/gnb/index.html +++ b/docs/API/models/gnb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3346,7 +3346,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3363,16 +3363,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3456,7 +3456,7 @@

    Methods

  • bootstrapping: Apply a bootstrap algorithm.
  • calibrate: Calibrate the model.
  • canvas: Create a figure with multiple plots.
  • clear: Reset attributes and clear cache from the model.
  • create_app: Create an interactive app to test model predictions.
  • create_dashboard: Create an interactive dashboard to analyze the model.
  • cross_validate: Evaluate the model using cross-validation.
  • decision_function: Get confidence scores on new data or existing rows.
  • evaluate: Get the model's scores for the provided metrics.
  • export_pipeline: Export the model's pipeline to a sklearn-like object.
  • fit: Fit and validate the model.
  • full_train: Train the estimator on the complete dataset.
  • get_best_threshold: Get the threshold that maximizes the ROC curve.
  • hyperparameter_tuning: Run the hyperparameter tuning algorithm.
  • inverse_transform: Inversely transform new data through the pipeline.
  • log: Print message and save to log file.
  • predict: Get predictions on new data or existing rows.
  • predict_log_proba: Get class log-probabilities on new data or existing rows.
  • predict_proba: Get class probabilities on new data or existing rows.
  • register: Register the model in mlflow's model registry.
  • reset_aesthetics: Reset the plot aesthetics to their default values.
  • save: Save the instance to a pickle file.
  • save_estimator: Save the estimator to a pickle file.
  • score: Get a metric score on new data.
  • serve: Serve the model as rest API endpoint for inference.
  • transform: Transform new data through the pipeline.
  • update_layout: Update the properties of the plot's layout.
  • update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3467,7 +3467,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3484,7 +3484,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3529,7 +3529,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3544,7 +3544,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3556,7 +3556,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3587,7 +3587,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3601,14 +3601,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3620,7 +3620,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3653,7 +3653,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3688,7 +3688,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3703,7 +3703,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3726,7 +3726,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3738,7 +3738,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3751,7 +3751,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3783,7 +3783,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3795,7 +3795,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3813,14 +3813,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3831,7 +3831,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3850,7 +3850,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3868,9 +3868,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3881,14 +3881,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3901,7 +3901,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3928,7 +3928,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3950,7 +3950,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3982,7 +3982,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3991,7 +3991,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/gp/index.html b/docs/API/models/gp/index.html index de806ce5e..c58766fa9 100644 --- a/docs/API/models/gp/index.html +++ b/docs/API/models/gp/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3361,7 +3361,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3378,16 +3378,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3471,7 +3471,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3482,7 +3482,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3499,7 +3499,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3544,7 +3544,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3559,7 +3559,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3571,7 +3571,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3602,7 +3602,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3616,14 +3616,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3635,7 +3635,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3668,7 +3668,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3703,7 +3703,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3718,7 +3718,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3741,7 +3741,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3753,7 +3753,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3766,7 +3766,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3798,7 +3798,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3810,7 +3810,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3828,14 +3828,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3846,7 +3846,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3865,7 +3865,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3883,9 +3883,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3896,14 +3896,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3916,7 +3916,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3943,7 +3943,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3965,7 +3965,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3997,7 +3997,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4006,7 +4006,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/hgbm/index.html b/docs/API/models/hgbm/index.html index 830781cb2..70613ee49 100644 --- a/docs/API/models/hgbm/index.html +++ b/docs/API/models/hgbm/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3378,7 +3378,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3395,16 +3395,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3506,7 +3506,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3517,7 +3517,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3534,7 +3534,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3579,7 +3579,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3594,7 +3594,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3606,7 +3606,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3637,7 +3637,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3651,14 +3651,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3670,7 +3670,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3703,7 +3703,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3738,7 +3738,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3753,7 +3753,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3776,7 +3776,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3788,7 +3788,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3801,7 +3801,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3833,7 +3833,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3845,7 +3845,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3863,14 +3863,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3881,7 +3881,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3900,7 +3900,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3918,9 +3918,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3931,14 +3931,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3951,7 +3951,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3978,7 +3978,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4000,7 +4000,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4032,7 +4032,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4041,7 +4041,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/huber/index.html b/docs/API/models/huber/index.html index d94083322..49be8e229 100644 --- a/docs/API/models/huber/index.html +++ b/docs/API/models/huber/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3366,7 +3366,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3383,16 +3383,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3440,7 +3440,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3451,7 +3451,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3468,7 +3468,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3513,7 +3513,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3528,7 +3528,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3540,7 +3540,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3571,7 +3571,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3585,14 +3585,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3604,7 +3604,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3637,7 +3637,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3672,7 +3672,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3687,7 +3687,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3710,7 +3710,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3722,7 +3722,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3735,7 +3735,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3767,7 +3767,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3779,7 +3779,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3797,14 +3797,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3815,7 +3815,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3834,7 +3834,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3852,9 +3852,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3865,14 +3865,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3885,7 +3885,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3912,7 +3912,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3934,7 +3934,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3966,7 +3966,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3975,7 +3975,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/knn/index.html b/docs/API/models/knn/index.html index 6dc9fb717..a35aad23d 100644 --- a/docs/API/models/knn/index.html +++ b/docs/API/models/knn/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3417,7 +3417,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3434,16 +3434,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3527,7 +3527,7 @@

    Methods

    bootstrapping: Apply a bootstrap algorithm.
    calibrate: Calibrate the model.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from the model.
    create_app: Create an interactive app to test model predictions.
    create_dashboard: Create an interactive dashboard to analyze the model.
    cross_validate: Evaluate the model using cross-validation.
    decision_function: Get confidence scores on new data or existing rows.
    evaluate: Get the model's scores for the provided metrics.
    export_pipeline: Export the model's pipeline to a sklearn-like object.
    fit: Fit and validate the model.
    full_train: Train the estimator on the complete dataset.
    get_best_threshold: Get the threshold that maximizes the ROC curve.
    hyperparameter_tuning: Run the hyperparameter tuning algorithm.
    inverse_transform: Inversely transform new data through the pipeline.
    log: Print message and save to log file.
    predict: Get predictions on new data or existing rows.
    predict_log_proba: Get class log-probabilities on new data or existing rows.
    predict_proba: Get class probabilities on new data or existing rows.
    register: Register the model in mlflow's model registry.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    save: Save the instance to a pickle file.
    save_estimator: Save the estimator to a pickle file.
    score: Get a metric score on new data.
    serve: Serve the model as a REST API endpoint for inference.
    transform: Transform new data through the pipeline.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3538,7 +3538,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3555,7 +3555,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3600,7 +3600,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3615,7 +3615,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3627,7 +3627,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3658,7 +3658,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3672,14 +3672,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3691,7 +3691,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3724,7 +3724,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3759,7 +3759,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3774,7 +3774,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3797,7 +3797,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3809,7 +3809,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3822,7 +3822,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3854,7 +3854,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3866,7 +3866,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3884,14 +3884,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3902,7 +3902,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3921,7 +3921,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3939,9 +3939,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3952,14 +3952,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3972,7 +3972,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3999,7 +3999,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4021,7 +4021,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4053,7 +4053,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4062,7 +4062,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/lars/index.html b/docs/API/models/lars/index.html index 589bd4f51..ff200436c 100644 --- a/docs/API/models/lars/index.html +++ b/docs/API/models/lars/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3349,7 +3349,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3366,16 +3366,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3423,7 +3423,7 @@

    Methods

    bootstrapping: Apply a bootstrap algorithm.
    calibrate: Calibrate the model.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from the model.
    create_app: Create an interactive app to test model predictions.
    create_dashboard: Create an interactive dashboard to analyze the model.
    cross_validate: Evaluate the model using cross-validation.
    decision_function: Get confidence scores on new data or existing rows.
    evaluate: Get the model's scores for the provided metrics.
    export_pipeline: Export the model's pipeline to a sklearn-like object.
    fit: Fit and validate the model.
    full_train: Train the estimator on the complete dataset.
    get_best_threshold: Get the threshold that maximizes the ROC curve.
    hyperparameter_tuning: Run the hyperparameter tuning algorithm.
    inverse_transform: Inversely transform new data through the pipeline.
    log: Print message and save to log file.
    predict: Get predictions on new data or existing rows.
    predict_log_proba: Get class log-probabilities on new data or existing rows.
    predict_proba: Get class probabilities on new data or existing rows.
    register: Register the model in mlflow's model registry.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    save: Save the instance to a pickle file.
    save_estimator: Save the estimator to a pickle file.
    score: Get a metric score on new data.
    serve: Serve the model as a REST API endpoint for inference.
    transform: Transform new data through the pipeline.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3434,7 +3434,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3451,7 +3451,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3496,7 +3496,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3511,7 +3511,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3523,7 +3523,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3554,7 +3554,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3568,14 +3568,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3587,7 +3587,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3620,7 +3620,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3655,7 +3655,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3670,7 +3670,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3693,7 +3693,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3705,7 +3705,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3718,7 +3718,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3750,7 +3750,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3762,7 +3762,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3780,14 +3780,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3798,7 +3798,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3817,7 +3817,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3835,9 +3835,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3848,14 +3848,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3868,7 +3868,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3895,7 +3895,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3917,7 +3917,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3949,7 +3949,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3958,7 +3958,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/lasso/index.html b/docs/API/models/lasso/index.html index ebc6d8334..f886efadc 100644 --- a/docs/API/models/lasso/index.html +++ b/docs/API/models/lasso/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3383,7 +3383,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3400,16 +3400,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3457,7 +3457,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3468,7 +3468,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3485,7 +3485,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3530,7 +3530,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3545,7 +3545,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3557,7 +3557,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3588,7 +3588,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3602,14 +3602,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3621,7 +3621,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3654,7 +3654,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3689,7 +3689,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3704,7 +3704,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3727,7 +3727,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3739,7 +3739,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3752,7 +3752,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3784,7 +3784,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3796,7 +3796,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3814,14 +3814,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3832,7 +3832,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3851,7 +3851,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3869,9 +3869,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3882,14 +3882,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3902,7 +3902,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3929,7 +3929,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3951,7 +3951,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3983,7 +3983,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3992,7 +3992,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/lda/index.html b/docs/API/models/lda/index.html index 790debeb2..f9cc33f34 100644 --- a/docs/API/models/lda/index.html +++ b/docs/API/models/lda/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3368,7 +3368,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3385,16 +3385,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3496,7 +3496,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3507,7 +3507,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3524,7 +3524,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3569,7 +3569,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3584,7 +3584,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3596,7 +3596,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3627,7 +3627,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3641,14 +3641,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3660,7 +3660,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3693,7 +3693,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3728,7 +3728,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3743,7 +3743,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3766,7 +3766,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3778,7 +3778,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3791,7 +3791,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3823,7 +3823,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3835,7 +3835,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3853,14 +3853,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3871,7 +3871,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3890,7 +3890,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3908,9 +3908,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3921,14 +3921,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3941,7 +3941,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3968,7 +3968,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3990,7 +3990,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4022,7 +4022,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4031,7 +4031,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/lgb/index.html b/docs/API/models/lgb/index.html index de339ac8e..53d32239f 100644 --- a/docs/API/models/lgb/index.html +++ b/docs/API/models/lgb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3385,7 +3385,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3402,19 +3402,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3516,7 +3516,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3527,7 +3527,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3544,7 +3544,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3589,7 +3589,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3604,7 +3604,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3616,7 +3616,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3647,7 +3647,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3661,14 +3661,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3680,7 +3680,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3713,7 +3713,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3748,7 +3748,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3763,7 +3763,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3786,7 +3786,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3798,7 +3798,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3811,7 +3811,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3843,7 +3843,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3855,7 +3855,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3873,14 +3873,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3891,7 +3891,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3910,7 +3910,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3928,9 +3928,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3941,14 +3941,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3961,7 +3961,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3988,7 +3988,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4010,7 +4010,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4042,7 +4042,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4051,7 +4051,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/lr/index.html b/docs/API/models/lr/index.html index 1aaf9af85..9ec451439 100644 --- a/docs/API/models/lr/index.html +++ b/docs/API/models/lr/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3388,7 +3388,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3405,16 +3405,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3516,7 +3516,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3527,7 +3527,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3544,7 +3544,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3589,7 +3589,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3604,7 +3604,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3616,7 +3616,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3647,7 +3647,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3661,14 +3661,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3680,7 +3680,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3713,7 +3713,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3748,7 +3748,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3763,7 +3763,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3786,7 +3786,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3798,7 +3798,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3811,7 +3811,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3843,7 +3843,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3855,7 +3855,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3873,14 +3873,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3891,7 +3891,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3910,7 +3910,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3928,9 +3928,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3941,14 +3941,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3961,7 +3961,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3988,7 +3988,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4010,7 +4010,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4042,7 +4042,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4051,7 +4051,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/lsvm/index.html b/docs/API/models/lsvm/index.html index f7dd805b1..0a81bc6d0 100644 --- a/docs/API/models/lsvm/index.html +++ b/docs/API/models/lsvm/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3393,7 +3393,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3410,16 +3410,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3485,7 +3485,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3496,7 +3496,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3513,7 +3513,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3558,7 +3558,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3573,7 +3573,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3585,7 +3585,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3616,7 +3616,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3630,14 +3630,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3649,7 +3649,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3682,7 +3682,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3717,7 +3717,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3732,7 +3732,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3755,7 +3755,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3767,7 +3767,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3780,7 +3780,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3812,7 +3812,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3824,7 +3824,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3842,14 +3842,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3860,7 +3860,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3879,7 +3879,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3897,9 +3897,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3910,14 +3910,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3930,7 +3930,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3957,7 +3957,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as rest API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3979,7 +3979,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4011,7 +4011,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4020,7 +4020,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/mlp/index.html b/docs/API/models/mlp/index.html index cac85ea8b..37c3f0eb5 100644 --- a/docs/API/models/mlp/index.html +++ b/docs/API/models/mlp/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3377,7 +3377,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3394,19 +3394,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3490,7 +3490,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3501,7 +3501,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3518,7 +3518,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3563,7 +3563,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3578,7 +3578,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3590,7 +3590,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3621,7 +3621,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3635,14 +3635,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3654,7 +3654,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3687,7 +3687,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3722,7 +3722,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3737,7 +3737,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3760,7 +3760,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3772,7 +3772,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3785,7 +3785,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3817,7 +3817,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3829,7 +3829,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3847,14 +3847,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3865,7 +3865,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3884,7 +3884,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3902,9 +3902,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3915,14 +3915,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3935,7 +3935,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3962,7 +3962,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3984,7 +3984,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4016,7 +4016,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4025,7 +4025,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/mnb/index.html b/docs/API/models/mnb/index.html index 3dd541f02..f0cf3b024 100644 --- a/docs/API/models/mnb/index.html +++ b/docs/API/models/mnb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3375,7 +3375,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3392,16 +3392,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3485,7 +3485,7 @@

    Methods

    bootstrapping: Apply a bootstrap algorithm.
    calibrate: Calibrate the model.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from the model.
    create_app: Create an interactive app to test model predictions.
    create_dashboard: Create an interactive dashboard to analyze the model.
    cross_validate: Evaluate the model using cross-validation.
    decision_function: Get confidence scores on new data or existing rows.
    evaluate: Get the model's scores for the provided metrics.
    export_pipeline: Export the model's pipeline to a sklearn-like object.
    fit: Fit and validate the model.
    full_train: Train the estimator on the complete dataset.
    get_best_threshold: Get the threshold that maximizes the ROC curve.
    hyperparameter_tuning: Run the hyperparameter tuning algorithm.
    inverse_transform: Inversely transform new data through the pipeline.
    log: Print message and save to log file.
    predict: Get predictions on new data or existing rows.
    predict_log_proba: Get class log-probabilities on new data or existing rows.
    predict_proba: Get class probabilities on new data or existing rows.
    register: Register the model in mlflow's model registry.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    save: Save the instance to a pickle file.
    save_estimator: Save the estimator to a pickle file.
    score: Get a metric score on new data.
    serve: Serve the model as a REST API endpoint for inference.
    transform: Transform new data through the pipeline.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3496,7 +3496,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3513,7 +3513,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3558,7 +3558,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3573,7 +3573,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3585,7 +3585,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3616,7 +3616,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3630,14 +3630,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3649,7 +3649,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3682,7 +3682,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3717,7 +3717,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3732,7 +3732,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3755,7 +3755,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3767,7 +3767,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3780,7 +3780,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3812,7 +3812,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3824,7 +3824,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3842,14 +3842,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3860,7 +3860,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3879,7 +3879,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3897,9 +3897,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3910,14 +3910,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3930,7 +3930,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3957,7 +3957,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3979,7 +3979,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4011,7 +4011,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4020,7 +4020,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/nf/index.html b/docs/API/models/nf/index.html index c3c69420e..b9f5ca054 100644 --- a/docs/API/models/nf/index.html +++ b/docs/API/models/nf/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3366,7 +3366,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3383,16 +3383,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3500,7 +3500,7 @@

    Methods

    bootstrapping: Apply a bootstrap algorithm.
    calibrate: Calibrate the model.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from the model.
    create_app: Create an interactive app to test model predictions.
    create_dashboard: Create an interactive dashboard to analyze the model.
    cross_validate: Evaluate the model using cross-validation.
    evaluate: Get the model's scores for the provided metrics.
    export_pipeline: Export the model's pipeline to a sklearn-like object.
    fit: Fit and validate the model.
    full_train: Train the estimator on the complete dataset.
    get_best_threshold: Get the threshold that maximizes the ROC curve.
    hyperparameter_tuning: Run the hyperparameter tuning algorithm.
    inverse_transform: Inversely transform new data through the pipeline.
    log: Print message and save to log file.
    predict: Get predictions on new data or existing rows.
    predict_interval: Get prediction intervals on new data or existing rows.
    predict_proba: Get probabilistic forecasts on new data or existing rows.
    predict_quantiles: Get probabilistic forecasts on new data or existing rows.
    predict_residuals: Get residuals of forecasts on new data or existing rows.
    predict_var: Get probabilistic forecasts on new data or existing rows.
    register: Register the model in mlflow's model registry.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    save: Save the instance to a pickle file.
    save_estimator: Save the estimator to a pickle file.
    score: Get a metric score on new data.
    serve: Serve the model as a REST API endpoint for inference.
    transform: Transform new data through the pipeline.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3511,7 +3511,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3528,7 +3528,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3573,7 +3573,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3588,7 +3588,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3600,7 +3600,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3631,7 +3631,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3645,7 +3645,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3678,7 +3678,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3713,7 +3713,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3728,7 +3728,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3751,7 +3751,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3763,7 +3763,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3776,7 +3776,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3808,7 +3808,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3820,7 +3820,7 @@

    Methods



    -

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3840,7 +3840,7 @@

    Methods



    -

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    +

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

    @@ -3862,7 +3862,7 @@

    Methods



    -

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3883,7 +3883,7 @@

    Methods



    -

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

    @@ -3907,7 +3907,7 @@

    Methods



    -

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    +

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

    @@ -3926,7 +3926,7 @@

    Methods



    -

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

    @@ -3949,7 +3949,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3967,9 +3967,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3980,14 +3980,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -4019,7 +4019,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4041,7 +4041,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4073,7 +4073,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4082,7 +4082,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/ols/index.html b/docs/API/models/ols/index.html index 79d871235..c56424e08 100644 --- a/docs/API/models/ols/index.html +++ b/docs/API/models/ols/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3348,7 +3348,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3365,16 +3365,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3422,7 +3422,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3433,7 +3433,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3450,7 +3450,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3495,7 +3495,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3510,7 +3510,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3522,7 +3522,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3553,7 +3553,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3567,14 +3567,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3586,7 +3586,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3619,7 +3619,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3654,7 +3654,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3669,7 +3669,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3692,7 +3692,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3704,7 +3704,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3717,7 +3717,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3749,7 +3749,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3761,7 +3761,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3779,14 +3779,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3797,7 +3797,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3816,7 +3816,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3834,9 +3834,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3847,14 +3847,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3867,7 +3867,7 @@

    Methods

    Parameters
    X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3894,7 +3894,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3916,7 +3916,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3948,7 +3948,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3957,7 +3957,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/omp/index.html b/docs/API/models/omp/index.html index f0a425511..bfcc4bb3f 100644 --- a/docs/API/models/omp/index.html +++ b/docs/API/models/omp/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3346,7 +3346,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3363,16 +3363,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3420,7 +3420,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3431,7 +3431,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3448,7 +3448,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3493,7 +3493,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3508,7 +3508,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3520,7 +3520,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3551,7 +3551,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3565,14 +3565,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3584,7 +3584,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3617,7 +3617,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3652,7 +3652,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3667,7 +3667,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3690,7 +3690,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3702,7 +3702,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3715,7 +3715,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3747,7 +3747,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3759,7 +3759,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3777,14 +3777,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3795,7 +3795,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3814,7 +3814,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3832,9 +3832,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3845,14 +3845,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3865,7 +3865,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3892,7 +3892,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as rest API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as rest API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3914,7 +3914,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3946,7 +3946,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3955,7 +3955,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/pa/index.html b/docs/API/models/pa/index.html index 36d8589ee..9014549f2 100644 --- a/docs/API/models/pa/index.html +++ b/docs/API/models/pa/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3375,7 +3375,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3392,19 +3392,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3470,7 +3470,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3481,7 +3481,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3498,7 +3498,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3543,7 +3543,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3558,7 +3558,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3570,7 +3570,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3601,7 +3601,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3615,14 +3615,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3634,7 +3634,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the get_best_threshold or plot_threshold @@ -3667,7 +3667,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3702,7 +3702,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3717,7 +3717,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3740,7 +3740,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3752,7 +3752,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3765,7 +3765,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3797,7 +3797,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3809,7 +3809,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3827,14 +3827,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3845,7 +3845,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3864,7 +3864,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3882,9 +3882,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3895,14 +3895,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3915,7 +3915,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3942,7 +3942,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3964,7 +3964,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3996,7 +3996,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4005,7 +4005,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/perc/index.html b/docs/API/models/perc/index.html index 6c78dfcbd..32476e169 100644 --- a/docs/API/models/perc/index.html +++ b/docs/API/models/perc/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3373,7 +3373,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3390,19 +3390,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3468,7 +3468,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3479,7 +3479,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3496,7 +3496,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3541,7 +3541,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3556,7 +3556,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3568,7 +3568,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3599,7 +3599,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3613,14 +3613,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3632,7 +3632,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3665,7 +3665,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3700,7 +3700,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3715,7 +3715,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3738,7 +3738,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3750,7 +3750,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3763,7 +3763,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3795,7 +3795,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3807,7 +3807,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3825,14 +3825,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3843,7 +3843,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3862,7 +3862,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3880,9 +3880,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3893,14 +3893,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3913,7 +3913,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3940,7 +3940,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3962,7 +3962,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -3994,7 +3994,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4003,7 +4003,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/pt/index.html b/docs/API/models/pt/index.html index 2af53fa01..e4f97cbc1 100644 --- a/docs/API/models/pt/index.html +++ b/docs/API/models/pt/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3365,7 +3365,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3382,16 +3382,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] for -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3499,7 +3499,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3510,7 +3510,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3527,7 +3527,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3572,7 +3572,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3587,7 +3587,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3599,7 +3599,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3630,7 +3630,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3644,7 +3644,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3677,7 +3677,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3712,7 +3712,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3727,7 +3727,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3750,7 +3750,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3762,7 +3762,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3775,7 +3775,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3807,7 +3807,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3819,7 +3819,7 @@

    Methods



    -

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(fh, X=None, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3839,7 +3839,7 @@

    Methods



    -

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    +

    method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]
    Get prediction intervals on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_interval method.

    @@ -3861,7 +3861,7 @@

    Methods



    -

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    +

    method predict_proba(fh, X=None, marginal=True, verbose=None)[source]
    Get probabilistic forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3882,7 +3882,7 @@

    Methods



    -

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get quantile forecasts on new data or existing rows.

    +

    method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]
    Get quantile forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_quantiles method.

    @@ -3906,7 +3906,7 @@

    Methods



    -

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    +

    method predict_residuals(y, X=None, verbose=None)[source]
    Get residuals of forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_residuals method.

    @@ -3925,7 +3925,7 @@

    Methods



    -

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get variance forecasts on new data or existing rows.

    +

    method predict_var(fh, X=None, cov=False, verbose=None)[source]
    Get variance forecasts on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_var method.

    @@ -3948,7 +3948,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3966,9 +3966,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3979,14 +3979,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(y, X=None, fh=None, metric=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -4018,7 +4018,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4040,7 +4040,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4072,7 +4072,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4081,7 +4081,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/qda/index.html b/docs/API/models/qda/index.html index ec6d677a0..6e8e23f15 100644 --- a/docs/API/models/qda/index.html +++ b/docs/API/models/qda/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3368,7 +3368,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3385,16 +3385,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3496,7 +3496,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3507,7 +3507,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3524,7 +3524,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3569,7 +3569,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3584,7 +3584,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3596,7 +3596,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3627,7 +3627,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3641,14 +3641,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3660,7 +3660,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3693,7 +3693,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3728,7 +3728,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3743,7 +3743,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3766,7 +3766,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3778,7 +3778,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3791,7 +3791,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3823,7 +3823,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3835,7 +3835,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3853,14 +3853,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3871,7 +3871,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3890,7 +3890,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3908,9 +3908,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3921,14 +3921,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3941,7 +3941,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3968,7 +3968,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3990,7 +3990,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4022,7 +4022,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4031,7 +4031,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/rf/index.html b/docs/API/models/rf/index.html index 74d5fff0f..d3fd296d9 100644 --- a/docs/API/models/rf/index.html +++ b/docs/API/models/rf/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3426,7 +3426,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3443,16 +3443,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3536,7 +3536,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3547,7 +3547,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3564,7 +3564,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3609,7 +3609,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3624,7 +3624,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3636,7 +3636,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3667,7 +3667,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3681,14 +3681,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3700,7 +3700,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3733,7 +3733,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3768,7 +3768,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3783,7 +3783,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3806,7 +3806,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3818,7 +3818,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3831,7 +3831,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3863,7 +3863,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3875,7 +3875,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3893,14 +3893,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3911,7 +3911,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3930,7 +3930,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3948,9 +3948,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3961,14 +3961,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3981,7 +3981,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -4008,7 +4008,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as JSON in the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4030,7 +4030,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4062,7 +4062,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4071,7 +4071,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/ridge/index.html b/docs/API/models/ridge/index.html index f81248432..de73780d9 100644 --- a/docs/API/models/ridge/index.html +++ b/docs/API/models/ridge/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3420,7 +3420,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3437,16 +3437,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3530,7 +3530,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3541,7 +3541,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3558,7 +3558,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3603,7 +3603,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3618,7 +3618,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3630,7 +3630,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3661,7 +3661,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3675,14 +3675,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3694,7 +3694,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3727,7 +3727,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3762,7 +3762,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3777,7 +3777,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3800,7 +3800,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3812,7 +3812,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3825,7 +3825,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3857,7 +3857,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters: msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3869,7 +3869,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3887,14 +3887,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters: X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3905,7 +3905,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3924,7 +3924,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3942,9 +3942,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters: filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3955,14 +3955,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters: filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3975,7 +3975,7 @@

    Methods

    Parameters: X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -4002,7 +4002,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4024,7 +4024,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4056,7 +4056,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4065,7 +4065,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/rnn/index.html b/docs/API/models/rnn/index.html index eb97d8567..9d1afdfe3 100644 --- a/docs/API/models/rnn/index.html +++ b/docs/API/models/rnn/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3389,7 +3389,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3406,16 +3406,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3499,7 +3499,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3510,7 +3510,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3527,7 +3527,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3572,7 +3572,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3587,7 +3587,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3599,7 +3599,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3630,7 +3630,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3644,14 +3644,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    Parameters: X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3663,7 +3663,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3696,7 +3696,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3731,7 +3731,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3746,7 +3746,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3769,7 +3769,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3781,7 +3781,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3794,7 +3794,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3826,7 +3826,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters: msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3838,7 +3838,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3856,14 +3856,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    Parameters: X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3874,7 +3874,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3893,7 +3893,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3911,9 +3911,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters: filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3924,14 +3924,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parameters: filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3944,7 +3944,7 @@

    Methods

    Parameters: X: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3971,7 +3971,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3993,7 +3993,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4025,7 +4025,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4034,7 +4034,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/sgd/index.html b/docs/API/models/sgd/index.html index e8e56f918..5332da139 100644 --- a/docs/API/models/sgd/index.html +++ b/docs/API/models/sgd/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3376,7 +3376,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3393,19 +3393,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3507,7 +3507,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3518,7 +3518,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3535,7 +3535,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3580,7 +3580,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3595,7 +3595,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3607,7 +3607,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3638,7 +3638,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3652,14 +3652,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3671,7 +3671,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3704,7 +3704,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3739,7 +3739,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3754,7 +3754,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3777,7 +3777,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3789,7 +3789,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3802,7 +3802,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3834,7 +3834,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3846,7 +3846,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3864,14 +3864,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3882,7 +3882,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3901,7 +3901,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3919,9 +3919,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3932,14 +3932,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3952,7 +3952,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3979,7 +3979,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4001,7 +4001,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4033,7 +4033,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4042,7 +4042,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/svm/index.html b/docs/API/models/svm/index.html index 0c9ba11cc..2a5e8d06f 100644 --- a/docs/API/models/svm/index.html +++ b/docs/API/models/svm/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3418,7 +3418,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3435,16 +3435,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3510,7 +3510,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3521,7 +3521,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3538,7 +3538,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3583,7 +3583,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3598,7 +3598,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3610,7 +3610,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3641,7 +3641,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3655,14 +3655,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3674,7 +3674,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3707,7 +3707,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3742,7 +3742,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3757,7 +3757,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3780,7 +3780,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3792,7 +3792,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3805,7 +3805,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3837,7 +3837,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3849,7 +3849,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3867,14 +3867,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3885,7 +3885,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3904,7 +3904,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3922,9 +3922,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3935,14 +3935,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3955,7 +3955,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3982,7 +3982,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -4004,7 +4004,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4036,7 +4036,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4045,7 +4045,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/tree/index.html b/docs/API/models/tree/index.html index a29a2828b..8504f05ab 100644 --- a/docs/API/models/tree/index.html +++ b/docs/API/models/tree/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3372,7 +3372,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3389,16 +3389,16 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3482,7 +3482,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3493,7 +3493,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3510,7 +3510,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3555,7 +3555,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3570,7 +3570,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3582,7 +3582,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3613,7 +3613,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3627,14 +3627,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3646,7 +3646,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3679,7 +3679,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3714,7 +3714,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3729,7 +3729,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3752,7 +3752,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3764,7 +3764,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3777,7 +3777,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3809,7 +3809,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3821,7 +3821,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3839,14 +3839,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3857,7 +3857,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3876,7 +3876,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3894,9 +3894,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3907,14 +3907,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3927,7 +3927,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3954,7 +3954,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3976,7 +3976,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4008,7 +4008,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4017,7 +4017,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/models/xgb/index.html b/docs/API/models/xgb/index.html index 0fa8ebafc..9251e0b9f 100644 --- a/docs/API/models/xgb/index.html +++ b/docs/API/models/xgb/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3375,7 +3375,7 @@

    Data attributes

    mapping: dict

    Encoded values and their respective mapped values.

    The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, -etc...).

    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).

    +etc...).dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3392,19 +3392,19 @@

    Utility attributes

  • score: Objective score(s) of the trial.
  • time_trial: Duration of the trial.
  • time_ht: Duration of the hyperparameter tuning.
  • -
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: Trial | None
    Trial that returned the highest score.
  • +
  • state: Trial's state (COMPLETE, PRUNED, FAIL).best_trial: FrozenTrial | None
    Trial that returned the highest score.
  • For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's @setter to change the best trial. See [here][example-hyperparameter-tuning] -an example.best_params: dict

    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: int | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: Predictor
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    +an example.
    best_params: dict
    Hyperparameters used by the best trial.
    score_ht: float | list[float] | None
    Metric score obtained by the best trial.
    time_ht: float | None
    Duration of the hyperparameter tuning (in seconds).
    estimator: PREDICTOR
    Estimator fitted on the training set.
    evals: dict
    Scores obtained per iteration of the training.

    Only the scores of the main metric are tracked. Included keys are: train and test. Read more in the -user guide.

    score_train: float | list[float]
    Metric score on the training set.
    score_test: float | list[float]
    Metric score on the test set.
    score_holdout: float | list[float]
    Metric score on the holdout set.
    time_fit: int
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    +user guide.
    score_train: SCALAR | list[SCALAR]
    Metric score on the training set.
    score_test: SCALAR | list[SCALAR]
    Metric score on the test set.
    score_holdout: SCALAR | list[SCALAR]
    Metric score on the holdout set.
    time_fit: float
    Duration of the model fitting on the train set (in seconds).
    bootstrap: pd.DataFrame | None
    Overview of the bootstrapping scores.

    The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using atom.bootstrap.mean() yields the same values as -score_bootstrap.

    score_bootstrap: float | list[float] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: int | None
    Duration of the bootstrapping (in seconds).
    time: int
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    +score_bootstrap.
    score_bootstrap: SCALAR | list[SCALAR] | None
    Mean metric score on the bootstrapped samples.
    time_bootstrap: float | None
    Duration of the bootstrapping (in seconds).
    time: float
    Total duration of the run (in seconds).
    feature_importance: pd.Series | None
    Normalized feature importance scores.

    The sum of importances for all features is 1. The scores are extracted from the estimator's scores_, coef_ or feature_importances_ attribute, checked in that order. @@ -3488,7 +3488,7 @@

    Methods

    bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the model's pipeline to a sklearn-like object.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.logPrint message and save to log file.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.


    -

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    +

    method bootstrapping(n_bootstrap, reset=False)[source]
    Apply a bootstrap algorithm.

    Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.

    @@ -3499,7 +3499,7 @@

    Methods



    -

    method calibrate(**kwargs)[source]
    Calibrate the model.

    +

    method calibrate(**kwargs)[source]
    Calibrate the model.

    Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier @@ -3516,7 +3516,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3561,7 +3561,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from the model.

    +

    method clear()[source]
    Reset attributes and clear cache from the model.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The @@ -3576,7 +3576,7 @@

    Methods

  • Cached holdout data sets


  • -

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    +

    method create_app(**kwargs)[source]
    Create an interactive app to test model predictions.

    Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed @@ -3588,7 +3588,7 @@

    Methods



    -

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    +

    method create_dashboard(dataset="test", filename=None, **kwargs)[source]
    Create an interactive dashboard to analyze the model.

    ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows @@ -3619,7 +3619,7 @@

    Methods



    -

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    +

    method cross_validate(**kwargs)[source]
    Evaluate the model using cross-validation.

    This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.

    @@ -3633,14 +3633,14 @@

    Methods



    -

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    +

    method decision_function(X, verbose=None)[source]
    Get confidence scores on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a decision_function method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3652,7 +3652,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get the model's scores for the provided metrics.

    Tip

    Use the self-get_best_threshold or plot_threshold @@ -3685,7 +3685,7 @@

    Methods



    -

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    +

    method export_pipeline(memory=None, verbose=None)[source]
    Export the model's pipeline to a sklearn-like object.

    The returned pipeline is already fitted on the training set. Note that, if the model used automated feature scaling, the Scaler is added to the pipeline.

    @@ -3720,7 +3720,7 @@

    Methods



    -

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    +

    method fit(X=None, y=None)[source]
    Fit and validate the model.

    The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the @@ -3735,7 +3735,7 @@

    Methods



    -

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    +

    method full_train(include_holdout=False)[source]
    Train the estimator on the complete dataset.

    In some cases it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly @@ -3758,7 +3758,7 @@

    Methods



    -

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    +

    method get_best_threshold(dataset="train")[source]
    Get the threshold that maximizes the ROC curve.

    Only available for models with a predict_proba method in a binary or multilabel classification task.

    @@ -3770,7 +3770,7 @@

    Methods



    -

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    +

    method hyperparameter_tuning(n_trials, reset=False)[source]
    Run the hyperparameter tuning algorithm.

    Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split @@ -3783,7 +3783,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    +

    method inverse_transform(X=None, y=None, verbose=None)[source]
    Inversely transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores @@ -3815,7 +3815,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3827,7 +3827,7 @@

    Methods



    -

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    +

    method predict(X, verbose=None)[source]
    Get predictions on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict method.

    @@ -3845,14 +3845,14 @@

    Methods



    -

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    +

    method predict_log_proba(X, verbose=None)[source]
    Get class log-probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_log_proba method.

    Read more in the user guide.

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    verbose: int or None, default=None
    Verbosity level of the output. If None, it uses the @@ -3863,7 +3863,7 @@

    Methods



    -

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    +

    method predict_proba(X, verbose=None)[source]
    Get class probabilities on new data or existing rows.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a predict_proba method.

    @@ -3882,7 +3882,7 @@

    Methods



    -

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    +

    method register(name=None, stage="None", archive_existing_versions=False)[source]
    Register the model in mlflow's model registry.

    This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.

    @@ -3900,9 +3900,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3913,14 +3913,14 @@

    Methods



    -

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    +

    method save_estimator(filename="auto")[source]
    Save the estimator to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.



    -

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    +

    method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]
    Get a metric score on new data.

    New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.

    @@ -3933,7 +3933,7 @@

    Methods

    ParametersX: int, str, slice, sequence or dataframe-like
    -Names or indices of rows in the dataset, or new feature +Names or positions of rows in the dataset, or new feature set with shape=(n_samples, n_features).

    y: int, str, dict, sequence, dataframe or None, default=None
    Target column corresponding to X.

    @@ -3960,7 +3960,7 @@

    Methods



    -

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as rest API endpoint for inference.

    +

    method serve(method="predict", host="127.0.0.1", port=8000)[source]
    Serve the model as a REST API endpoint for inference.

    The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. requests.get("http://127.0.0.1:8000/", json=X.to_json()). @@ -3982,7 +3982,7 @@

    Methods



    -

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    +

    method transform(X=None, y=None, verbose=None)[source]
    Transform new data through the pipeline.

    Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be @@ -4014,7 +4014,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -4023,7 +4023,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    diff --git a/docs/API/nlp/textcleaner/index.html b/docs/API/nlp/textcleaner/index.html index 1b978fd25..eedc93231 100644 --- a/docs/API/nlp/textcleaner/index.html +++ b/docs/API/nlp/textcleaner/index.html @@ -1212,7 +1212,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1254,7 +1254,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1296,7 +1296,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3240,7 +3240,7 @@

    TextCleaner


    -

    class atom.nlp.TextCleaner(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, verbose=0, logger=None)[source]
    Applies standard text cleaning to the corpus.

    +

    class atom.nlp.TextCleaner(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, verbose=0, logger=None)[source]
    Applies standard text cleaning to the corpus.

    Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc.). The transformations are applied on the column named corpus, in @@ -3360,7 +3360,7 @@

    Methods

    fitDoes nothing.
    fit_transformFit to data, then transform it.
    get_metadata_routingGet metadata routing of this object.
    get_paramsGet parameters for this estimator.
    inverse_transformDoes nothing.
    logPrint message and save to log file.
    saveSave the instance to a pickle file.
    set_paramsSet the parameters of this estimator.
    transformApply the transformations to the data.


    -

    method fit(X=None, y=None, **fit_params)[source]
    Does nothing.

    +

    method fit(X=None, y=None, **fit_params)[source]
    Does nothing.

    Implemented for continuity of the API.

    ParametersX: dataframe-like or None, default=None
    @@ -3385,7 +3385,7 @@

    Methods



    -

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    +

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3429,7 +3429,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    +

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3453,7 +3453,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3465,7 +3465,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3485,7 +3485,7 @@

    Methods



    -

    method transform(X, y=None)[source]
    Apply the transformations to the data.

    +

    method transform(X, y=None)[source]
    Apply the transformations to the data.

    ParametersX: dataframe-like
    Feature set with shape=(n_samples, n_features). If X is diff --git a/docs/API/nlp/textnormalizer/index.html b/docs/API/nlp/textnormalizer/index.html index 85fb70120..1c83d1c2f 100644 --- a/docs/API/nlp/textnormalizer/index.html +++ b/docs/API/nlp/textnormalizer/index.html @@ -1212,7 +1212,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1254,7 +1254,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1296,7 +1296,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3240,7 +3240,7 @@

    TextNormalizer


    -

    class atom.nlp.TextNormalizer(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, verbose=0, logger=None)[source]
    Normalize the corpus.

    +

    class atom.nlp.TextNormalizer(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, verbose=0, logger=None)[source]
    Normalize the corpus.

    Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, @@ -3347,7 +3347,7 @@

    Methods

    fitDoes nothing.
    fit_transformFit to data, then transform it.
    get_metadata_routingGet metadata routing of this object.
    get_paramsGet parameters for this estimator.
    inverse_transformDoes nothing.
    logPrint message and save to log file.
    saveSave the instance to a pickle file.
    set_paramsSet the parameters of this estimator.
    transformNormalize the text.


    -

    method fit(X=None, y=None, **fit_params)[source]
    Does nothing.

    +

    method fit(X=None, y=None, **fit_params)[source]
    Does nothing.

    Implemented for continuity of the API.

    ParametersX: dataframe-like or None, default=None
    @@ -3372,7 +3372,7 @@

    Methods



    -

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    +

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3416,7 +3416,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    +

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3440,7 +3440,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3452,7 +3452,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3472,7 +3472,7 @@

    Methods



    -

    method transform(X, y=None)[source]
    Normalize the text.

    +

    method transform(X, y=None)[source]
    Normalize the text.

    ParametersX: dataframe-like
    Feature set with shape=(n_samples, n_features). If X is diff --git a/docs/API/nlp/tokenizer/index.html b/docs/API/nlp/tokenizer/index.html index 1d9edecc0..561a470e7 100644 --- a/docs/API/nlp/tokenizer/index.html +++ b/docs/API/nlp/tokenizer/index.html @@ -1212,7 +1212,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1254,7 +1254,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1296,7 +1296,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3240,7 +3240,7 @@

    Tokenizer


    -

    class atom.nlp.Tokenizer(bigram_freq=None, trigram_freq=None, quadgram_freq=None, verbose=0, logger=None)[source]
    Tokenize the corpus.

    +

    class atom.nlp.Tokenizer(bigram_freq=None, trigram_freq=None, quadgram_freq=None, verbose=0, logger=None)[source]
    Tokenize the corpus.

    Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g. "New_York") based on their frequency in the corpus. The @@ -3352,7 +3352,7 @@

    Methods

    fitDoes nothing.
    fit_transformFit to data, then transform it.
    get_metadata_routingGet metadata routing of this object.
    get_paramsGet parameters for this estimator.
    inverse_transformDoes nothing.
    logPrint message and save to log file.
    saveSave the instance to a pickle file.
    set_paramsSet the parameters of this estimator.
    transformTokenize the text.


    -

    method fit(X=None, y=None, **fit_params)[source]
    Does nothing.

    +

    method fit(X=None, y=None, **fit_params)[source]
    Does nothing.

    Implemented for continuity of the API.

    ParametersX: dataframe-like or None, default=None
    @@ -3377,7 +3377,7 @@

    Methods



    -

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    +

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3421,7 +3421,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    +

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3445,7 +3445,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3457,7 +3457,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3477,7 +3477,7 @@

    Methods



    -

    method transform(X, y=None)[source]
    Tokenize the text.

    +

    method transform(X, y=None)[source]
    Tokenize the text.

    ParametersX: dataframe-like
    Feature set with shape=(n_samples, n_features). If X is diff --git a/docs/API/nlp/vectorizer/index.html b/docs/API/nlp/vectorizer/index.html index 0aef8c576..d0321af38 100644 --- a/docs/API/nlp/vectorizer/index.html +++ b/docs/API/nlp/vectorizer/index.html @@ -1212,7 +1212,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1254,7 +1254,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1296,7 +1296,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3240,7 +3240,7 @@

    Vectorizer


    -

    class atom.nlp.Vectorizer(strategy="bow", return_sparse=True, device="cpu", engine=None, verbose=0, logger=None, **kwargs)[source]
    Vectorize text data.

    +

    class atom.nlp.Vectorizer(strategy="bow", return_sparse=True, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]
    Vectorize text data.

    Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

    @@ -3266,17 +3266,16 @@

    Vectorizer

    follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

    -

    engine: dict or None, default=None
    +

    engine: dict, default={"data": "numpy", "estimator": "sklearn"}
    Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

    +corresponding choice as values. Choose from:

    • "data":

        -
      • "numpy" (default)
      • +
      • "numpy"
      • "pyarrow"
      • "modin"
      @@ -3284,7 +3283,7 @@

      Vectorizer

    • "estimator":

        -
      • "sklearn" (default)
      • +
      • "sklearn"
      • "cuml"
    • @@ -3375,7 +3374,7 @@

      Methods

    fitFit to data.
    fit_transformFit to data, then transform it.
    get_metadata_routingGet metadata routing of this object.
    get_paramsGet parameters for this estimator.
    inverse_transformDoes nothing.
    logPrint message and save to log file.
    saveSave the instance to a pickle file.
    set_paramsSet the parameters of this estimator.
    transformVectorize the text.


    -

    method fit(X, y=None)[source]
    Fit to data.

    +

    method fit(X, y=None)[source]
    Fit to data.

    ParametersX: dataframe-like
    Feature set with shape=(n_samples, n_features). If X is @@ -3388,7 +3387,7 @@

    Methods



    -

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    +

    method fit_transform(X=None, y=None, **fit_params)[source]
    Fit to data, then transform it.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3432,7 +3431,7 @@

    Methods



    -

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    +

    method inverse_transform(X=None, y=None)[source]
    Does nothing.

    ParametersX: dataframe-like or None, default=None
    Feature set with shape=(n_samples, n_features). If None, @@ -3456,7 +3455,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parametersmsg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3468,7 +3467,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parametersfilename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3488,7 +3487,7 @@

    Methods



    -

    method transform(X, y=None)[source]
    Vectorize the text.

    +

    method transform(X, y=None)[source]
    Vectorize the text.

    ParametersX: dataframe-like
    Feature set with shape=(n_samples, n_features). If X is diff --git a/docs/API/plots/plot_calibration/index.html b/docs/API/plots/plot_calibration/index.html index 751cdc431..88adc595b 100644 --- a/docs/API/plots/plot_calibration/index.html +++ b/docs/API/plots/plot_calibration/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_calibration


    -

    method plot_calibration(models=None, dataset="test", n_bins=10, target=0, title=None, legend="upper left", figsize=(900, 900), filename=None, display=True)[source]
    Plot the calibration curve for a binary classifier.

    +

    method plot_calibration(models=None, dataset="test", n_bins=10, target=0, title=None, legend="upper left", figsize=(900, 900), filename=None, display=True)[source]
    Plot the calibration curve for a binary classifier.

    Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance a well diff --git a/docs/API/plots/plot_components/index.html b/docs/API/plots/plot_components/index.html index 8457eac0c..9cbf19248 100644 --- a/docs/API/plots/plot_components/index.html +++ b/docs/API/plots/plot_components/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_components


    -

    method plot_components(show=None, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot the explained variance ratio per component.

    +

    method plot_components(show=None, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot the explained variance ratio per component.

    Kept components are colored and discarded components are transparent. This plot is available only when feature selection was applied with strategy="pca".

    diff --git a/docs/API/plots/plot_confusion_matrix/index.html b/docs/API/plots/plot_confusion_matrix/index.html index 4e3c98f13..d5766735d 100644 --- a/docs/API/plots/plot_confusion_matrix/index.html +++ b/docs/API/plots/plot_confusion_matrix/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_confusion_matrix


    -

    method plot_confusion_matrix(models=None, dataset="test", target=0, threshold=0.5, title=None, legend="upper right", figsize=None, filename=None, display=True)[source]
    Plot a model's confusion matrix.

    +

    method plot_confusion_matrix(models=None, dataset="test", target=0, threshold=0.5, title=None, legend="upper right", figsize=None, filename=None, display=True)[source]
    Plot a model's confusion matrix.

    For one model, the plot shows a heatmap. For multiple models, it compares TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). This plot is available diff --git a/docs/API/plots/plot_correlation/index.html b/docs/API/plots/plot_correlation/index.html index c3554e24d..d62f88d25 100644 --- a/docs/API/plots/plot_correlation/index.html +++ b/docs/API/plots/plot_correlation/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_correlation


    -

    method plot_correlation(columns=None, method="pearson", title=None, legend=None, figsize=(800, 700), filename=None, display=True)[source]
    Plot a correlation matrix.

    +

    method plot_correlation(columns=None, method="pearson", title=None, legend=None, figsize=(800, 700), filename=None, display=True)[source]
    Plot a correlation matrix.

    Displays a heatmap showing the correlation between columns in the dataset. The colors red, blue and white stand for positive, negative, and no correlation respectively.

    diff --git a/docs/API/plots/plot_det/index.html b/docs/API/plots/plot_det/index.html index 67645ea1a..b7472dda9 100644 --- a/docs/API/plots/plot_det/index.html +++ b/docs/API/plots/plot_det/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_det


    -

    method plot_det(models=None, dataset="test", target=0, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the Detection Error Tradeoff curve.

    +

    method plot_det(models=None, dataset="test", target=0, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the Detection Error Tradeoff curve.

    Read more about DET in sklearn's documentation. Only available for binary classification tasks.

    diff --git a/docs/API/plots/plot_distribution/index.html b/docs/API/plots/plot_distribution/index.html index 598511187..49c1fb4bf 100644 --- a/docs/API/plots/plot_distribution/index.html +++ b/docs/API/plots/plot_distribution/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_distribution


    -

    method plot_distribution(columns=0, distributions=None, show=None, title=None, legend="upper right", figsize=None, filename=None, display=True)[source]
    Plot column distributions.

    +

    method plot_distribution(columns=0, distributions=None, show=None, title=None, legend="upper right", figsize=None, filename=None, display=True)[source]
    Plot column distributions.

    • For numerical columns, plot the probability density distribution. Additionally, it's possible to plot any of diff --git a/docs/API/plots/plot_edf/index.html b/docs/API/plots/plot_edf/index.html index d67f420b5..943b62063 100644 --- a/docs/API/plots/plot_edf/index.html +++ b/docs/API/plots/plot_edf/index.html @@ -1162,7 +1162,7 @@
    • - DirectRegressor + DirectForecaster
    • @@ -1204,7 +1204,7 @@
    • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    • @@ -1246,7 +1246,7 @@
    • - TrainSizingRegressor + TrainSizingForecaster
    • @@ -3226,7 +3226,7 @@

      plot_edf


      -

      method plot_edf(models=None, metric=None, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
      Plot the Empirical Distribution Function of a study.

      +

      method plot_edf(models=None, metric=None, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
      Plot the Empirical Distribution Function of a study.

      Use this plot to analyze and improve hyperparameter search spaces. The EDF assumes that the value of the objective function is in accordance with the uniform distribution over diff --git a/docs/API/plots/plot_errors/index.html b/docs/API/plots/plot_errors/index.html index f80c71f1a..8df41553f 100644 --- a/docs/API/plots/plot_errors/index.html +++ b/docs/API/plots/plot_errors/index.html @@ -1162,7 +1162,7 @@

    • - DirectRegressor + DirectForecaster
    • @@ -1204,7 +1204,7 @@
    • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    • @@ -1246,7 +1246,7 @@
    • - TrainSizingRegressor + TrainSizingForecaster
    • @@ -3226,7 +3226,7 @@

      plot_errors


      -

      method plot_errors(models=None, dataset="test", target=0, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
      Plot a model's prediction errors.

      +

      method plot_errors(models=None, dataset="test", target=0, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
      Plot a model's prediction errors.

      Plot the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot diff --git a/docs/API/plots/plot_evals/index.html b/docs/API/plots/plot_evals/index.html index 820cac3a4..2c34c6103 100644 --- a/docs/API/plots/plot_evals/index.html +++ b/docs/API/plots/plot_evals/index.html @@ -1162,7 +1162,7 @@

    • - DirectRegressor + DirectForecaster
    • @@ -1204,7 +1204,7 @@
    • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    • @@ -1246,7 +1246,7 @@
    • - TrainSizingRegressor + TrainSizingForecaster
    • @@ -3226,7 +3226,7 @@

      plot_evals


      -

      method plot_evals(models=None, dataset="test", title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
      Plot evaluation curves.

      +

      method plot_evals(models=None, dataset="test", title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
      Plot evaluation curves.

      The evaluation curves are the main metric scores achieved by the models at every iteration of the training process. This plot is available only for models that allow in-training validation.

      diff --git a/docs/API/plots/plot_feature_importance/index.html b/docs/API/plots/plot_feature_importance/index.html index 6b76d9d13..4c546224d 100644 --- a/docs/API/plots/plot_feature_importance/index.html +++ b/docs/API/plots/plot_feature_importance/index.html @@ -1162,7 +1162,7 @@
    • - DirectRegressor + DirectForecaster
    • @@ -1204,7 +1204,7 @@
    • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    • @@ -1246,7 +1246,7 @@
    • - TrainSizingRegressor + TrainSizingForecaster
    • @@ -3226,7 +3226,7 @@

      plot_feature_importance


      -

      method plot_feature_importance(models=None, show=None, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
      Plot a model's feature importance.

      +

      method plot_feature_importance(models=None, show=None, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
      Plot a model's feature importance.

      The sum of importances for all features (per model) is 1. This plot is available only for models whose estimator has a scores_, feature_importances_ or coef attribute.

      diff --git a/docs/API/plots/plot_forecast/index.html b/docs/API/plots/plot_forecast/index.html index 646e6e0f0..fbf3e017a 100644 --- a/docs/API/plots/plot_forecast/index.html +++ b/docs/API/plots/plot_forecast/index.html @@ -1162,7 +1162,7 @@
    • - DirectRegressor + DirectForecaster
    • @@ -1204,7 +1204,7 @@
    • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    • @@ -1246,7 +1246,7 @@
    • - TrainSizingRegressor + TrainSizingForecaster
    • @@ -3226,7 +3226,7 @@

      plot_forecast


      -

      method plot_forecast(models=None, fh="test", X=None, target=0, plot_interval=True, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
      Plot a time series with model forecasts.

      +

      method plot_forecast(models=None, fh="test", X=None, target=0, plot_interval=True, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
      Plot a time series with model forecasts.

      This plot is only available for forecasting tasks.

    Parametersmodels: int, str, Model, slice, sequence or None, default=None
    diff --git a/docs/API/plots/plot_gains/index.html b/docs/API/plots/plot_gains/index.html index 288ff960d..24bde5903 100644 --- a/docs/API/plots/plot_gains/index.html +++ b/docs/API/plots/plot_gains/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_gains


    -

    method plot_gains(models=None, dataset="test", target=0, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the cumulative gains curve.

    +

    method plot_gains(models=None, dataset="test", target=0, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the cumulative gains curve.

    This plot is available only for binary and multilabel classification tasks.

    diff --git a/docs/API/plots/plot_hyperparameter_importance/index.html b/docs/API/plots/plot_hyperparameter_importance/index.html index 1d7cf5c50..0c278044d 100644 --- a/docs/API/plots/plot_hyperparameter_importance/index.html +++ b/docs/API/plots/plot_hyperparameter_importance/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_hyperparameter_importance


    -

    method plot_hyperparameter_importance(models=None, metric=0, show=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot a model's hyperparameter importance.

    +

    method plot_hyperparameter_importance(models=None, metric=0, show=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot a model's hyperparameter importance.

    The hyperparameter importances are calculated using the fANOVA importance evaluator. The sum of importances for all parameters (per model) is 1. This plot is only available for diff --git a/docs/API/plots/plot_hyperparameters/index.html b/docs/API/plots/plot_hyperparameters/index.html index b3985f061..6e20583bb 100644 --- a/docs/API/plots/plot_hyperparameters/index.html +++ b/docs/API/plots/plot_hyperparameters/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_hyperparameters


    -

    method plot_hyperparameters(models=None, params=(0, 1), metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot hyperparameter relationships in a study.

    +

    method plot_hyperparameters(models=None, params=(0, 1), metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot hyperparameter relationships in a study.

    A model's hyperparameters are plotted against each other. The corresponding metric scores are displayed in a contour plot. The markers are the trials in the study. This plot is only diff --git a/docs/API/plots/plot_learning_curve/index.html b/docs/API/plots/plot_learning_curve/index.html index 2c8ea5f98..100e1747a 100644 --- a/docs/API/plots/plot_learning_curve/index.html +++ b/docs/API/plots/plot_learning_curve/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_learning_curve


    -

    method plot_learning_curve(models=None, metric=None, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the learning curve: score vs number of training samples.

    +

    method plot_learning_curve(models=None, metric=None, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the learning curve: score vs number of training samples.

    This plot is available only for models fitted using train sizing. Ensembles are ignored.

    diff --git a/docs/API/plots/plot_lift/index.html b/docs/API/plots/plot_lift/index.html index 9ff25a71d..80ae5e1ca 100644 --- a/docs/API/plots/plot_lift/index.html +++ b/docs/API/plots/plot_lift/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_lift


    -

    method plot_lift(models=None, dataset="test", target=0, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the lift curve.

    +

    method plot_lift(models=None, dataset="test", target=0, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the lift curve.

    Only available for binary classification tasks.

    Parametersmodels: int, str, Model, slice, sequence or None, default=None
    diff --git a/docs/API/plots/plot_ngrams/index.html b/docs/API/plots/plot_ngrams/index.html index deb3483a4..a6aef3c64 100644 --- a/docs/API/plots/plot_ngrams/index.html +++ b/docs/API/plots/plot_ngrams/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_ngrams


    -

    method plot_ngrams(ngram="bigram", index=None, show=10, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot n-gram frequencies.

    +

    method plot_ngrams(ngram="bigram", index=None, show=10, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot n-gram frequencies.

    The text for the plot is extracted from the column named corpus. If there is no column with that name, an exception is raised. If the documents are not tokenized, the words are diff --git a/docs/API/plots/plot_parallel_coordinate/index.html b/docs/API/plots/plot_parallel_coordinate/index.html index d1a1f3967..c4696eb6d 100644 --- a/docs/API/plots/plot_parallel_coordinate/index.html +++ b/docs/API/plots/plot_parallel_coordinate/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_parallel_coordinate


    -

    method plot_parallel_coordinate(models=None, params=None, metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot high-dimensional parameter relationships in a study.

    +

    method plot_parallel_coordinate(models=None, params=None, metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot high-dimensional parameter relationships in a study.

    Every line of the plot represents one trial. This plot is only available for models that ran hyperparameter tuning.

    diff --git a/docs/API/plots/plot_pareto_front/index.html b/docs/API/plots/plot_pareto_front/index.html index 36c14508a..9e938c45c 100644 --- a/docs/API/plots/plot_pareto_front/index.html +++ b/docs/API/plots/plot_pareto_front/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_pareto_front


    -

    method plot_pareto_front(models=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot the Pareto front of a study.

    +

    method plot_pareto_front(models=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot the Pareto front of a study.

    Shows the trial scores plotted against each other. The markers' colors indicate the trial number. This plot is only available for models that ran multi-metric runs with diff --git a/docs/API/plots/plot_parshap/index.html b/docs/API/plots/plot_parshap/index.html index 500a31baf..2e8234215 100644 --- a/docs/API/plots/plot_parshap/index.html +++ b/docs/API/plots/plot_parshap/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_parshap


    -

    method plot_parshap(models=None, columns=None, target=1, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
    Plot the partial correlation of shap values.

    +

    method plot_parshap(models=None, columns=None, target=1, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
    Plot the partial correlation of shap values.

    Plots the train and test correlation between the shap value of every feature with its target value, after removing the effect of all other features (partial correlation). This plot is diff --git a/docs/API/plots/plot_partial_dependence/index.html b/docs/API/plots/plot_partial_dependence/index.html index 601304cb5..660e2b59d 100644 --- a/docs/API/plots/plot_partial_dependence/index.html +++ b/docs/API/plots/plot_partial_dependence/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_partial_dependence


    -

    method plot_partial_dependence(models=None, columns=None, kind="average", pair=None, target=1, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the partial dependence of features.

    +

    method plot_partial_dependence(models=None, columns=None, kind="average", pair=None, target=1, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the partial dependence of features.

    The partial dependence of a feature (or a set of features) corresponds to the response of the model for each possible value of the feature. The plot can take two forms:

    diff --git a/docs/API/plots/plot_pca/index.html b/docs/API/plots/plot_pca/index.html index afdbc4087..e3077a7d9 100644 --- a/docs/API/plots/plot_pca/index.html +++ b/docs/API/plots/plot_pca/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_pca


    -

    method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]
    Plot the explained variance ratio vs number of components.

    +

    method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]
    Plot the explained variance ratio vs number of components.

    If the underlying estimator is PCA (for dense datasets), all possible components are plotted. If the underlying estimator is TruncatedSVD (for sparse datasets), it only shows the diff --git a/docs/API/plots/plot_permutation_importance/index.html b/docs/API/plots/plot_permutation_importance/index.html index dcb30cb1f..a2c15e8bc 100644 --- a/docs/API/plots/plot_permutation_importance/index.html +++ b/docs/API/plots/plot_permutation_importance/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_permutation_importance


    -

    method plot_permutation_importance(models=None, show=None, n_repeats=10, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot the feature permutation importance of models.

    +

    method plot_permutation_importance(models=None, show=None, n_repeats=10, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot the feature permutation importance of models.

    Warning

    This method can be slow. Results are cached to speed up diff --git a/docs/API/plots/plot_pipeline/index.html b/docs/API/plots/plot_pipeline/index.html index a3d080fca..190019354 100644 --- a/docs/API/plots/plot_pipeline/index.html +++ b/docs/API/plots/plot_pipeline/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_pipeline


    -

    method plot_pipeline(models=None, draw_hyperparameter_tuning=True, color_branches=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot a diagram of the pipeline.

    +

    method plot_pipeline(models=None, draw_hyperparameter_tuning=True, color_branches=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot a diagram of the pipeline.

    Warning

    This plot uses the schemdraw package, which is diff --git a/docs/API/plots/plot_prc/index.html b/docs/API/plots/plot_prc/index.html index 06120a14f..4393c84e6 100644 --- a/docs/API/plots/plot_prc/index.html +++ b/docs/API/plots/plot_prc/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_prc


    -

    method plot_prc(models=None, dataset="test", target=0, title=None, legend="lower left", figsize=(900, 600), filename=None, display=True)[source]
    Plot the precision-recall curve.

    +

    method plot_prc(models=None, dataset="test", target=0, title=None, legend="lower left", figsize=(900, 600), filename=None, display=True)[source]
    Plot the precision-recall curve.

    Read more about PRC in sklearn's documentation. Only available for binary classification tasks.

    diff --git a/docs/API/plots/plot_probabilities/index.html b/docs/API/plots/plot_probabilities/index.html index d3a741948..7821af324 100644 --- a/docs/API/plots/plot_probabilities/index.html +++ b/docs/API/plots/plot_probabilities/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_probabilities


    -

    method plot_probabilities(models=None, dataset="test", target=1, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the probability distribution of the target classes.

    +

    method plot_probabilities(models=None, dataset="test", target=1, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the probability distribution of the target classes.

    This plot is available only for models with a predict_proba method in classification tasks.

    diff --git a/docs/API/plots/plot_qq/index.html b/docs/API/plots/plot_qq/index.html index 127164819..8415f5ea2 100644 --- a/docs/API/plots/plot_qq/index.html +++ b/docs/API/plots/plot_qq/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_qq


    -

    method plot_qq(columns=0, distributions="norm", title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot a quantile-quantile plot.

    +

    method plot_qq(columns=0, distributions="norm", title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot a quantile-quantile plot.

    Columns are distinguished by color and the distributions are distinguished by marker type. Missing values are ignored.

    diff --git a/docs/API/plots/plot_relationships/index.html b/docs/API/plots/plot_relationships/index.html index 166e34b75..c299da0e2 100644 --- a/docs/API/plots/plot_relationships/index.html +++ b/docs/API/plots/plot_relationships/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_relationships


    -

    method plot_relationships(columns=(0, 1, 2), title=None, legend=None, figsize=(900, 900), filename=None, display=True)[source]
    Plot pairwise relationships in a dataset.

    +

    method plot_relationships(columns=(0, 1, 2), title=None, legend=None, figsize=(900, 900), filename=None, display=True)[source]
    Plot pairwise relationships in a dataset.

    Creates a grid of axes such that each numerical column appears once on the x-axes and once on the y-axes. The bottom triangle contains scatter plots (max 250 random samples), the diagonal diff --git a/docs/API/plots/plot_residuals/index.html b/docs/API/plots/plot_residuals/index.html index 89837bd62..f56412738 100644 --- a/docs/API/plots/plot_residuals/index.html +++ b/docs/API/plots/plot_residuals/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_residuals


    -

    method plot_residuals(models=None, dataset="test", target=0, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
    Plot a model's residuals.

    +

    method plot_residuals(models=None, dataset="test", target=0, title=None, legend="upper left", figsize=(900, 600), filename=None, display=True)[source]
    Plot a model's residuals.

    The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line diff --git a/docs/API/plots/plot_results/index.html b/docs/API/plots/plot_results/index.html index 7c7b1bb73..899161221 100644 --- a/docs/API/plots/plot_results/index.html +++ b/docs/API/plots/plot_results/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_results


    -

    method plot_results(models=None, metric=None, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot the model results.

    +

    method plot_results(models=None, metric=None, title=None, legend="lower right", figsize=None, filename=None, display=True)[source]
    Plot the model results.

    If all models applied bootstrap, the plot is a boxplot. If not, the plot is a barplot. Models are ordered based on their score from the top down. The score is either the diff --git a/docs/API/plots/plot_rfecv/index.html b/docs/API/plots/plot_rfecv/index.html index 07275b10b..ed73cf2b6 100644 --- a/docs/API/plots/plot_rfecv/index.html +++ b/docs/API/plots/plot_rfecv/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_rfecv


    -

    method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]
    Plot the rfecv results.

    +

    method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]
    Plot the rfecv results.

    Plot the scores obtained by the estimator fitted on every subset of the dataset. Only available when feature selection was applied with strategy="rfecv".

    diff --git a/docs/API/plots/plot_roc/index.html b/docs/API/plots/plot_roc/index.html index 119a4297f..a0731761d 100644 --- a/docs/API/plots/plot_roc/index.html +++ b/docs/API/plots/plot_roc/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_roc


    -

    method plot_roc(models=None, dataset="test", target=0, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the Receiver Operating Characteristics curve.

    +

    method plot_roc(models=None, dataset="test", target=0, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the Receiver Operating Characteristics curve.

    Read more about ROC in sklearn's documentation. Only available for classification tasks.

    diff --git a/docs/API/plots/plot_shap_bar/index.html b/docs/API/plots/plot_shap_bar/index.html index b0a177a32..3dd47443f 100644 --- a/docs/API/plots/plot_shap_bar/index.html +++ b/docs/API/plots/plot_shap_bar/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_bar


    -

    method plot_shap_bar(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's bar plot.

    +

    method plot_shap_bar(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's bar plot.

    Create a bar plot of a set of SHAP values. If a single sample is passed, then the SHAP values are plotted. If many samples are passed, then the mean absolute value for each feature diff --git a/docs/API/plots/plot_shap_beeswarm/index.html b/docs/API/plots/plot_shap_beeswarm/index.html index 90577fd2d..e60ea226a 100644 --- a/docs/API/plots/plot_shap_beeswarm/index.html +++ b/docs/API/plots/plot_shap_beeswarm/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_beeswarm


    -

    method plot_shap_beeswarm(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's beeswarm plot.

    +

    method plot_shap_beeswarm(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's beeswarm plot.

    The plot is colored by feature values. Read more about SHAP plots in the user guide.

    diff --git a/docs/API/plots/plot_shap_decision/index.html b/docs/API/plots/plot_shap_decision/index.html index b6777f22f..30f483eb5 100644 --- a/docs/API/plots/plot_shap_decision/index.html +++ b/docs/API/plots/plot_shap_decision/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_decision


    -

    method plot_shap_decision(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's decision plot.

    +

    method plot_shap_decision(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's decision plot.

    Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values are printed in the diff --git a/docs/API/plots/plot_shap_force/index.html b/docs/API/plots/plot_shap_force/index.html index 917cfeb07..64d9079d2 100644 --- a/docs/API/plots/plot_shap_force/index.html +++ b/docs/API/plots/plot_shap_force/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_force


    -

    method plot_shap_force(models=None, index=None, target=1, title=None, legend=None, figsize=(900, 300), filename=None, display=True, **kwargs)[source]
    Plot SHAP's force plot.

    +

    method plot_shap_force(models=None, index=None, target=1, title=None, legend=None, figsize=(900, 300), filename=None, display=True, **kwargs)[source]
    Plot SHAP's force plot.

    Visualize the given SHAP values with an additive force layout. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is diff --git a/docs/API/plots/plot_shap_heatmap/index.html b/docs/API/plots/plot_shap_heatmap/index.html index 1d2bf8f55..697460e56 100644 --- a/docs/API/plots/plot_shap_heatmap/index.html +++ b/docs/API/plots/plot_shap_heatmap/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_heatmap


    -

    method plot_shap_heatmap(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's heatmap plot.

    +

    method plot_shap_heatmap(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's heatmap plot.

    This plot is designed to show the population substructure of a dataset using supervised clustering and a heatmap. Supervised clustering involves clustering data points not by their original diff --git a/docs/API/plots/plot_shap_scatter/index.html b/docs/API/plots/plot_shap_scatter/index.html index 3baa5faff..8a5680b1f 100644 --- a/docs/API/plots/plot_shap_scatter/index.html +++ b/docs/API/plots/plot_shap_scatter/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_scatter


    -

    method plot_shap_scatter(models=None, index=None, columns=0, target=1, title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]
    Plot SHAP's scatter plot.

    +

    method plot_shap_scatter(models=None, index=None, columns=0, target=1, title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]
    Plot SHAP's scatter plot.

    Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of diff --git a/docs/API/plots/plot_shap_waterfall/index.html b/docs/API/plots/plot_shap_waterfall/index.html index 7a85f4845..506ed4e89 100644 --- a/docs/API/plots/plot_shap_waterfall/index.html +++ b/docs/API/plots/plot_shap_waterfall/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_shap_waterfall


    -

    method plot_shap_waterfall(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's waterfall plot.

    +

    method plot_shap_waterfall(models=None, index=None, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot SHAP's waterfall plot.

    The SHAP value of a feature represents the impact of the evidence provided by that feature on the model’s output. The waterfall plot is designed to visually display how the SHAP diff --git a/docs/API/plots/plot_slice/index.html b/docs/API/plots/plot_slice/index.html index 41b26b41b..e54db2230 100644 --- a/docs/API/plots/plot_slice/index.html +++ b/docs/API/plots/plot_slice/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_slice


    -

    method plot_slice(models=None, params=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot the parameter relationship in a study.

    +

    method plot_slice(models=None, params=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]
    Plot the parameter relationship in a study.

    The color of the markers indicates the trial. This plot is only available for models that ran hyperparameter tuning.

    diff --git a/docs/API/plots/plot_successive_halving/index.html b/docs/API/plots/plot_successive_halving/index.html index 38cf2e24b..d2e58e4af 100644 --- a/docs/API/plots/plot_successive_halving/index.html +++ b/docs/API/plots/plot_successive_halving/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_successive_halving


    -

    method plot_successive_halving(models=None, metric=None, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot scores per iteration of the successive halving.

    +

    method plot_successive_halving(models=None, metric=None, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot scores per iteration of the successive halving.

    Only use with models fitted using successive halving. Ensembles are ignored.

    diff --git a/docs/API/plots/plot_terminator_improvement/index.html b/docs/API/plots/plot_terminator_improvement/index.html index d4fb32ce2..b637f665a 100644 --- a/docs/API/plots/plot_terminator_improvement/index.html +++ b/docs/API/plots/plot_terminator_improvement/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_terminator_improvement


    -

    method plot_terminator_improvement(models=None, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the potentials for future objective improvement.

    +

    method plot_terminator_improvement(models=None, title=None, legend="upper right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the potentials for future objective improvement.

    This function visualizes the objective improvement potentials. It helps to determine whether you should continue the optimization or not. The evaluated error is also plotted. Note diff --git a/docs/API/plots/plot_threshold/index.html b/docs/API/plots/plot_threshold/index.html index ab5873885..eecd628e8 100644 --- a/docs/API/plots/plot_threshold/index.html +++ b/docs/API/plots/plot_threshold/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_threshold


    -

    method plot_threshold(models=None, metric=None, dataset="test", target=0, steps=100, title=None, legend="lower left", figsize=(900, 600), filename=None, display=True)[source]
    Plot metric performances against threshold values.

    +

    method plot_threshold(models=None, metric=None, dataset="test", target=0, steps=100, title=None, legend="lower left", figsize=(900, 600), filename=None, display=True)[source]
    Plot metric performances against threshold values.

    This plot is available only for models with a predict_proba method in a binary or multilabel classification task.

    diff --git a/docs/API/plots/plot_timeline/index.html b/docs/API/plots/plot_timeline/index.html index 9e4a31f34..017fed6cb 100644 --- a/docs/API/plots/plot_timeline/index.html +++ b/docs/API/plots/plot_timeline/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_timeline


    -

    method plot_timeline(models=None, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the timeline of a study.

    +

    method plot_timeline(models=None, title=None, legend="lower right", figsize=(900, 600), filename=None, display=True)[source]
    Plot the timeline of a study.

    This plot is only available for models that ran hyperparameter tuning.

    diff --git a/docs/API/plots/plot_trials/index.html b/docs/API/plots/plot_trials/index.html index 983a07d68..c660f41a0 100644 --- a/docs/API/plots/plot_trials/index.html +++ b/docs/API/plots/plot_trials/index.html @@ -1162,7 +1162,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_trials


    -

    method plot_trials(models=None, metric=None, title=None, legend="upper left", figsize=(900, 800), filename=None, display=True)[source]
    Plot the hyperparameter tuning trials.

    +

    method plot_trials(models=None, metric=None, title=None, legend="upper left", figsize=(900, 800), filename=None, display=True)[source]
    Plot the hyperparameter tuning trials.

    Creates a figure with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. The best trial is indicated with a star. diff --git a/docs/API/plots/plot_wordcloud/index.html b/docs/API/plots/plot_wordcloud/index.html index 646e41f80..3952c97aa 100644 --- a/docs/API/plots/plot_wordcloud/index.html +++ b/docs/API/plots/plot_wordcloud/index.html @@ -1162,7 +1162,7 @@

  • - DirectRegressor + DirectForecaster
  • @@ -1204,7 +1204,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1246,7 +1246,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3226,7 +3226,7 @@

    plot_wordcloud


    -

    method plot_wordcloud(index=None, title=None, legend=None, figsize=(900, 600), filename=None, display=True, **kwargs)[source]
    Plot a wordcloud from the corpus.

    +

    method plot_wordcloud(index=None, title=None, legend=None, figsize=(900, 600), filename=None, display=True, **kwargs)[source]
    Plot a wordcloud from the corpus.

    The text for the plot is extracted from the column named corpus. If there is no column with that name, an exception is raised.

    diff --git a/docs/API/training/directclassifier/index.html b/docs/API/training/directclassifier/index.html index 9e0abc154..b350f860a 100644 --- a/docs/API/training/directclassifier/index.html +++ b/docs/API/training/directclassifier/index.html @@ -1253,7 +1253,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1295,7 +1295,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1337,7 +1337,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3322,7 +3322,7 @@

    DirectClassifier


    -

    class atom.training.DirectClassifier(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Train and evaluate the models in a direct fashion.

    +

    class atom.training.DirectClassifier(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Train and evaluate the models in a direct fashion.

    The following steps are applied to every model:

    1. Apply hyperparameter tuning (optional).
    2. @@ -3403,7 +3403,7 @@

      DirectClassifier

    3. "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials.
    4. n_jobs: int, default=1

      @@ -3418,17 +3418,16 @@

      DirectClassifier

      follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

      -

      engine: dict or None, default=None
      +

      engine: dict, default={"data": "numpy", "estimator": "sklearn"}
      Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

      +corresponding choice as values. Choose from:

      • "data":

          -
        • "numpy" (default)
        • +
        • "numpy"
        • "pyarrow"
        • "modin"
        @@ -3436,7 +3435,7 @@

        DirectClassifier

      • "estimator":

          -
        • "sklearn" (default)
        • +
        • "sklearn"
        • "sklearnex"
        • "cuml"
        @@ -3516,7 +3515,7 @@

        Data attributes

        Updating the dataset will automatically update the response of these attributes accordingly.

    -

    +

    Attributes
    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: series
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: series
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).
    Attributes
    dataset: dataframe
    Complete data set.
    train: dataframe
    Training set.
    test: dataframe
    Test set.
    X: dataframe
    Feature set.
    y: series | dataframe
    Target column(s).
    X_train: dataframe
    Features of the training set.
    y_train: series | dataframe
    Target column(s) of the training set.
    X_test: dataframe
    Features of the test set.
    y_test: series | dataframe
    Target column(s) of the test set.
    shape: tuple[int, int]
    Shape of the dataset (n_rows, n_columns).
    columns: index
    Name of all the columns.
    n_columns: int
    Number of columns.
    features: index
    Name of the features.
    n_features: int
    Number of features.
    target: str | list[str]
    Name of the target column(s).


    Utility attributes

    @@ -3560,7 +3559,7 @@

    Plot attributes

    The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

    -

    Attributes
    palette: str | SEQUENCE
    Color palette.

    +

    Attributes
    palette: str | sequence
    Color palette.

    Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = ["red", "green", "blue"].

    title_fontsize: int
    Fontsize for the plot's title.
    label_fontsize: int
    Fontsize for the labels, legend and hover information.
    tick_fontsize: int
    Fontsize for the ticks along the plot's axes.
    line_width: int
    Width of the line plots.
    marker_size: int
    Size of the markers.
    @@ -3572,7 +3571,7 @@

    Methods

    available_models: Give an overview of the available predefined models.
    canvas: Create a figure with multiple plots.
    clear: Reset attributes and clear cache from all models.
    delete: Delete models.
    evaluate: Get all models' scores for the provided metrics.
    export_pipeline: Export the pipeline to a sklearn-like object.
    get_class_weight: Return class weights for a balanced data set.
    get_params: Get parameters for this estimator.
    log: Print message and save to log file.
    merge: Merge another instance of the same class into this one.
    update_layout: Update the properties of the plot's layout.
    update_traces: Update the properties of the plot's traces.
    reset_aesthetics: Reset the plot aesthetics to their default values.
    run: Train and evaluate the models.
    save: Save the instance to a pickle file.
    set_params: Set the parameters of this estimator.
    stacking: Add a Stacking model to the pipeline.
    voting: Add a Voting model to the pipeline.


    -

    method available_models()[source]
    Give an overview of the available predefined models.

    +

    method available_models()[source]
    Give an overview of the available predefined models.

    Returns
    pd.DataFrame
    Information about the available predefined models. Columns @@ -3594,7 +3593,7 @@

    Methods



    -

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    +

    method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
    Create a figure with multiple plots.

    This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

    @@ -3639,7 +3638,7 @@

    Methods



    -

    method clear()[source]
    Reset attributes and clear cache from all models.

    +

    method clear()[source]
    Reset attributes and clear cache from all models.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3654,7 +3653,7 @@

    Methods

  • Cached holdout data sets


  • -

    method delete(models=None)[source]
    Delete models.

    +

    method delete(models=None)[source]
    Delete models.

    If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3665,7 +3664,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    Parameters
    metric: str, func, scorer, sequence or None, default=None
    Metric to calculate. If None, it returns an overview of @@ -3692,7 +3691,7 @@

    Methods



    -

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    +

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

    @@ -3731,7 +3730,7 @@

    Methods



    -

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    +

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3756,7 +3755,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3768,7 +3767,7 @@

    Methods



    -

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    +

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3786,7 +3785,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3795,7 +3794,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    @@ -3804,9 +3803,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method run(*arrays)[source]
    Train and evaluate the models.

    +

    method run(*arrays)[source]
    Train and evaluate the models.

    Read more in the user guide.

    Parameters
    *arrays: sequence of indexables
    @@ -3819,7 +3818,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3839,7 +3838,7 @@

    Methods



    -

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    +

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -3859,7 +3858,7 @@

    Methods



    -

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    +

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -3924,13 +3923,13 @@

    Methods

    - n_jobs: int, default=1
    @@ -3413,17 +3413,16 @@

    DirectForecaster

    follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

    -

    engine: dict or None, default=None
    +

    engine: dict, default={"data": "numpy", "estimator": "sklearn"}
    Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

    +corresponding choice as values. Choose from:



    -

    method clear()[source]
    Reset attributes and clear cache from all models.

    +

    method clear()[source]
    Reset attributes and clear cache from all models.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3646,7 +3645,7 @@

    Methods

  • Cached holdout data sets


  • -

    method delete(models=None)[source]
    Delete models.

    +

    method delete(models=None)[source]
    Delete models.

    If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3657,7 +3656,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    Parameters
    metric: str, func, scorer, sequence or None, default=None
    Metric to calculate. If None, it returns an overview of @@ -3684,7 +3683,7 @@

    Methods



    -

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    +

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

    @@ -3723,7 +3722,7 @@

    Methods



    -

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    +

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3748,7 +3747,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3760,7 +3759,7 @@

    Methods



    -

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    +

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3778,7 +3777,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3787,7 +3786,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    @@ -3796,9 +3795,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method run(*arrays)[source]
    Train and evaluate the models.

    +

    method run(*arrays)[source]
    Train and evaluate the models.

    Read more in the user guide.

    Parameters
    *arrays: sequence of indexables
    @@ -3811,7 +3810,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3831,7 +3830,7 @@

    Methods



    -

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    +

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -3851,7 +3850,7 @@

    Methods



    -

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    +

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -4029,4 +4028,4 @@

    Methods

    - \ No newline at end of file + diff --git a/docs/API/training/directregressor/index.html b/docs/API/training/directregressor/index.html index 0a1a60fa6..48e70803b 100644 --- a/docs/API/training/directregressor/index.html +++ b/docs/API/training/directregressor/index.html @@ -1164,7 +1164,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1295,7 +1295,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1337,7 +1337,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3322,7 +3322,7 @@

    DirectRegressor


    -

    class atom.training.DirectRegressor(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Train and evaluate the models in a direct fashion.

    +

    class atom.training.DirectRegressor(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Train and evaluate the models in a direct fashion.

    The following steps are applied to every model:

    1. Apply hyperparameter tuning (optional).
    2. @@ -3398,7 +3398,7 @@

      DirectRegressor

    3. "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials.
    n_jobs: int, default=1
    @@ -3413,17 +3413,16 @@

    DirectRegressor

    follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

    -

    engine: dict or None, default=None
    +

    engine: dict, default={"data": "numpy", "estimator": "sklearn"}
    Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

    +corresponding choice as values. Choose from:



    -

    method clear()[source]
    Reset attributes and clear cache from all models.

    +

    method clear()[source]
    Reset attributes and clear cache from all models.

    Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3649,7 +3648,7 @@

    Methods

  • Cached holdout data sets


  • -

    method delete(models=None)[source]
    Delete models.

    +

    method delete(models=None)[source]
    Delete models.

    If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3660,7 +3659,7 @@

    Methods



    -

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    +

    method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
    Get all models' scores for the provided metrics.

    Parameters
    metric: str, func, scorer, sequence or None, default=None
    Metric to calculate. If None, it returns an overview of @@ -3687,7 +3686,7 @@

    Methods



    -

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    +

    method export_pipeline(model=None, memory=None, verbose=None)[source]
    Export the pipeline to a sklearn-like object.

    Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

    @@ -3726,7 +3725,7 @@

    Methods



    -

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    +

    method get_class_weight(dataset="train")[source]
    Return class weights for a balanced data set.

    Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3751,7 +3750,7 @@

    Methods



    -

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    +

    method log(msg, level=0, severity="info")[source]
    Print message and save to log file.

    Parameters
    msg: int, float or str
    Message to save to the logger and print to stdout.

    @@ -3763,7 +3762,7 @@

    Methods



    -

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    +

    method merge(other, suffix="2")[source]
    Merge another instance of the same class into this one.

    Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3781,7 +3780,7 @@

    Methods



    -

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    +

    method update_layout(**kwargs)[source]
    Update the properties of the plot's layout.

    Recursively update the structure of the original layout with the values in the arguments.

    @@ -3790,7 +3789,7 @@

    Methods



    -

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    +

    method update_traces(**kwargs)[source]
    Update the properties of the plot's traces.

    Recursively update the structure of the original traces with the values in the arguments.

    @@ -3799,9 +3798,9 @@

    Methods



    -

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.

    +

    method reset_aesthetics()[source]
    Reset the plot aesthetics to their default values.



    -

    method run(*arrays)[source]
    Train and evaluate the models.

    +

    method run(*arrays)[source]
    Train and evaluate the models.

    Read more in the user guide.

    Parameters
    *arrays: sequence of indexables
    @@ -3814,7 +3813,7 @@

    Methods



    -

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    +

    method save(filename="auto", save_data=True)[source]
    Save the instance to a pickle file.

    Parameters
    filename: str, default="auto"
    Name of the file. Use "auto" for automatic naming.

    @@ -3834,7 +3833,7 @@

    Methods



    -

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    +

    method stacking(models=None, name="Stack", **kwargs)[source]
    Add a Stacking model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -3854,7 +3853,7 @@

    Methods



    -

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    +

    method voting(models=None, name="Vote", **kwargs)[source]
    Add a Voting model to the pipeline.

    Warning

    Combining models trained on different branches into one @@ -3903,7 +3902,7 @@

    Methods

    @@ -4032,4 +4031,4 @@

    Methods

    - \ No newline at end of file + diff --git a/docs/API/training/successivehalvingclassifier/index.html b/docs/API/training/successivehalvingclassifier/index.html index 371156460..da2fd03ff 100644 --- a/docs/API/training/successivehalvingclassifier/index.html +++ b/docs/API/training/successivehalvingclassifier/index.html @@ -1164,7 +1164,7 @@
  • - DirectRegressor + DirectForecaster
  • @@ -1295,7 +1295,7 @@
  • - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
  • @@ -1337,7 +1337,7 @@
  • - TrainSizingRegressor + TrainSizingForecaster
  • @@ -3322,7 +3322,7 @@

    SuccessiveHalvingClassifier


    -

    class atom.training.SuccessiveHalvingClassifier(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Train and evaluate the models in a successive halving fashion.

    +

    class atom.training.SuccessiveHalvingClassifier(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
    Train and evaluate the models in a successive halving fashion.

    The following steps are applied to every model (per iteration):

    1. Apply hyperparameter tuning (optional).
    2. @@ -3405,7 +3405,7 @@

      SuccessiveHalvingClassifier

    3. "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials.
    4. n_jobs: int, default=1

      @@ -3420,17 +3420,16 @@

      SuccessiveHalvingClassifier

      follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

      -

      engine: dict or None, default=None
      +

      engine: dict, default={"data": "numpy", "estimator": "sklearn"}
      Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

      +corresponding choice as values. Choose from:



      -

      method clear()[source]
      Reset attributes and clear cache from all models.

      +

      method clear()[source]
      Reset attributes and clear cache from all models.

      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3656,7 +3655,7 @@

      Methods

    5. Cached holdout data sets


    6. -

      method delete(models=None)[source]
      Delete models.

      +

      method delete(models=None)[source]
      Delete models.

      If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3667,7 +3666,7 @@

      Methods



      -

      method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
      Get all models' scores for the provided metrics.

      +

      method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
      Get all models' scores for the provided metrics.

      Parameters
      metric: str, func, scorer, sequence or None, default=None
      Metric to calculate. If None, it returns an overview of @@ -3694,7 +3693,7 @@

      Methods



      -

      method export_pipeline(model=None, memory=None, verbose=None)[source]
      Export the pipeline to a sklearn-like object.

      +

      method export_pipeline(model=None, memory=None, verbose=None)[source]
      Export the pipeline to a sklearn-like object.

      Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

      @@ -3733,7 +3732,7 @@

      Methods



      -

      method get_class_weight(dataset="train")[source]
      Return class weights for a balanced data set.

      +

      method get_class_weight(dataset="train")[source]
      Return class weights for a balanced data set.

      Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3758,7 +3757,7 @@

      Methods



      -

      method log(msg, level=0, severity="info")[source]
      Print message and save to log file.

      +

      method log(msg, level=0, severity="info")[source]
      Print message and save to log file.

      Parameters
      msg: int, float or str
      Message to save to the logger and print to stdout.

      @@ -3770,7 +3769,7 @@

      Methods



      -

      method merge(other, suffix="2")[source]
      Merge another instance of the same class into this one.

      +

      method merge(other, suffix="2")[source]
      Merge another instance of the same class into this one.

      Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3788,7 +3787,7 @@

      Methods



      -

      method update_layout(**kwargs)[source]
      Update the properties of the plot's layout.

      +

      method update_layout(**kwargs)[source]
      Update the properties of the plot's layout.

      Recursively update the structure of the original layout with the values in the arguments.

      @@ -3797,7 +3796,7 @@

      Methods



      -

      method update_traces(**kwargs)[source]
      Update the properties of the plot's traces.

      +

      method update_traces(**kwargs)[source]
      Update the properties of the plot's traces.

      Recursively update the structure of the original traces with the values in the arguments.

      @@ -3806,9 +3805,9 @@

      Methods



      -

      method reset_aesthetics()[source]
      Reset the plot aesthetics to their default values.

      +

      method reset_aesthetics()[source]
      Reset the plot aesthetics to their default values.



      -

      method run(*arrays)[source]
      Train and evaluate the models.

      +

      method run(*arrays)[source]
      Train and evaluate the models.

      Read more in the user guide.

      Parameters
      *arrays: sequence of indexables
      @@ -3821,7 +3820,7 @@

      Methods



      -

      method save(filename="auto", save_data=True)[source]
      Save the instance to a pickle file.

      +

      method save(filename="auto", save_data=True)[source]
      Save the instance to a pickle file.

      Parameters
      filename: str, default="auto"
      Name of the file. Use "auto" for automatic naming.

      @@ -3841,7 +3840,7 @@

      Methods



      -

      method stacking(models=None, name="Stack", **kwargs)[source]
      Add a Stacking model to the pipeline.

      +

      method stacking(models=None, name="Stack", **kwargs)[source]
      Add a Stacking model to the pipeline.

      Warning

      Combining models trained on different branches into one @@ -3861,7 +3860,7 @@

      Methods



      -

      method voting(models=None, name="Vote", **kwargs)[source]
      Add a Voting model to the pipeline.

      +

      method voting(models=None, name="Vote", **kwargs)[source]
      Add a Voting model to the pipeline.

      Warning

      Combining models trained on different branches into one @@ -3926,13 +3925,13 @@

      Methods

      - n_jobs: int, default=1
      @@ -3415,17 +3415,16 @@

      SuccessiveHalvingForecaster

      follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

      -

      engine: dict or None, default=None
      +

      engine: dict, default={"data": "numpy", "estimator": "sklearn"}
      Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

      +corresponding choice as values. Choose from:

      • "data":

          -
        • "numpy" (default)
        • +
        • "numpy"
        • "pyarrow"
        • "modin"
        @@ -3433,7 +3432,7 @@

        SuccessiveHalvingForecaster

      • "estimator":

          -
        • "sklearn" (default)
        • +
        • "sklearn"
        • "sklearnex"
        • "cuml"
        @@ -3510,7 +3509,7 @@

        Data attributes

        Updating the dataset will automatically update the response of these attributes accordingly.

        -

        +

        Attributes
        dataset: dataframe
        Complete data set.
        train: dataframe
        Training set.
        test: dataframe
        Test set.
        X: dataframe
        Feature set.
        y: series | dataframe
        Target column(s).
        X_train: dataframe
        Features of the training set.
        y_train: series | dataframe
        Target column(s) of the training set.
        X_test: dataframe
        Features of the test set.
        y_test: series | dataframe
        Target column(s) of the test set.
        shape: tuple[int, int]
        Shape of the dataset (n_rows, n_columns).
        columns: series
        Name of all the columns.
        n_columns: int
        Number of columns.
        features: series
        Name of the features.
        n_features: int
        Number of features.
        target: str | list[str]
        Name of the target column(s).
        Attributes
        dataset: dataframe
        Complete data set.
        train: dataframe
        Training set.
        test: dataframe
        Test set.
        X: dataframe
        Feature set.
        y: series | dataframe
        Target column(s).
        X_train: dataframe
        Features of the training set.
        y_train: series | dataframe
        Target column(s) of the training set.
        X_test: dataframe
        Features of the test set.
        y_test: series | dataframe
        Target column(s) of the test set.
        shape: tuple[int, int]
        Shape of the dataset (n_rows, n_columns).
        columns: index
        Name of all the columns.
        n_columns: int
        Number of columns.
        features: index
        Name of the features.
        n_features: int
        Number of features.
        target: str | list[str]
        Name of the target column(s).


        Utility attributes

        @@ -3554,7 +3553,7 @@

        Plot attributes

        The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

        -

        Attributes
        palette: str | SEQUENCE
        Color palette.

        +

        Attributes
        palette: str | sequence
        Color palette.

        Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = ["red", "green", "blue"].

        title_fontsize: int
        Fontsize for the plot's title.
        label_fontsize: int
        Fontsize for the labels, legend and hover information.
        tick_fontsize: int
        Fontsize for the ticks along the plot's axes.
        line_width: int
        Width of the line plots.
        marker_size: int
        Size of the markers.
        @@ -3566,7 +3565,7 @@

        Methods

        available_models — Give an overview of the available predefined models.
        canvas — Create a figure with multiple plots.
        clear — Reset attributes and clear cache from all models.
        delete — Delete models.
        evaluate — Get all models' scores for the provided metrics.
        export_pipeline — Export the pipeline to a sklearn-like object.
        get_class_weight — Return class weights for a balanced data set.
        get_params — Get parameters for this estimator.
        log — Print message and save to log file.
        merge — Merge another instance of the same class into this one.
        update_layout — Update the properties of the plot's layout.
        update_traces — Update the properties of the plot's traces.
        reset_aesthetics — Reset the plot aesthetics to their default values.
        run — Train and evaluate the models.
        save — Save the instance to a pickle file.
        set_params — Set the parameters of this estimator.
        stacking — Add a Stacking model to the pipeline.
        voting — Add a Voting model to the pipeline.


        -

        method available_models()[source]
        Give an overview of the available predefined models.

        +

        method available_models()[source]
        Give an overview of the available predefined models.

        Returns
        pd.DataFrame
        Information about the available predefined models. Columns @@ -3588,7 +3587,7 @@

        Methods



        -

        method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
        Create a figure with multiple plots.

        +

        method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
        Create a figure with multiple plots.

        This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

        @@ -3633,7 +3632,7 @@

        Methods



      -

      method clear()[source]
      Reset attributes and clear cache from all models.

      +

      method clear()[source]
      Reset attributes and clear cache from all models.

      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3648,7 +3647,7 @@

      Methods

    7. Cached holdout data sets


    8. -

      method delete(models=None)[source]
      Delete models.

      +

      method delete(models=None)[source]
      Delete models.

      If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3659,7 +3658,7 @@

      Methods



      -

      method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
      Get all models' scores for the provided metrics.

      +

      method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
      Get all models' scores for the provided metrics.

      Parameters
      metric: str, func, scorer, sequence or None, default=None
      Metric to calculate. If None, it returns an overview of @@ -3686,7 +3685,7 @@

      Methods



      -

      method export_pipeline(model=None, memory=None, verbose=None)[source]
      Export the pipeline to a sklearn-like object.

      +

      method export_pipeline(model=None, memory=None, verbose=None)[source]
      Export the pipeline to a sklearn-like object.

      Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

      @@ -3725,7 +3724,7 @@

      Methods



      -

      method get_class_weight(dataset="train")[source]
      Return class weights for a balanced data set.

      +

      method get_class_weight(dataset="train")[source]
      Return class weights for a balanced data set.

      Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3750,7 +3749,7 @@

      Methods



      -

      method log(msg, level=0, severity="info")[source]
      Print message and save to log file.

      +

      method log(msg, level=0, severity="info")[source]
      Print message and save to log file.

      Parameters
      msg: int, float or str
      Message to save to the logger and print to stdout.

      @@ -3762,7 +3761,7 @@

      Methods



      -

      method merge(other, suffix="2")[source]
      Merge another instance of the same class into this one.

      +

      method merge(other, suffix="2")[source]
      Merge another instance of the same class into this one.

      Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3780,7 +3779,7 @@

      Methods



      -

      method update_layout(**kwargs)[source]
      Update the properties of the plot's layout.

      +

      method update_layout(**kwargs)[source]
      Update the properties of the plot's layout.

      Recursively update the structure of the original layout with the values in the arguments.

      @@ -3789,7 +3788,7 @@

      Methods



      -

      method update_traces(**kwargs)[source]
      Update the properties of the plot's traces.

      +

      method update_traces(**kwargs)[source]
      Update the properties of the plot's traces.

      Recursively update the structure of the original traces with the values in the arguments.

      @@ -3798,9 +3797,9 @@

      Methods



      -

      method reset_aesthetics()[source]
      Reset the plot aesthetics to their default values.

      +

      method reset_aesthetics()[source]
      Reset the plot aesthetics to their default values.



      -

      method run(*arrays)[source]
      Train and evaluate the models.

      +

      method run(*arrays)[source]
      Train and evaluate the models.

      Read more in the user guide.

      Parameters
      *arrays: sequence of indexables
      @@ -3813,7 +3812,7 @@

      Methods



      -

      method save(filename="auto", save_data=True)[source]
      Save the instance to a pickle file.

      +

      method save(filename="auto", save_data=True)[source]
      Save the instance to a pickle file.

      Parameters
      filename: str, default="auto"
      Name of the file. Use "auto" for automatic naming.

      @@ -3833,7 +3832,7 @@

      Methods



      -

      method stacking(models=None, name="Stack", **kwargs)[source]
      Add a Stacking model to the pipeline.

      +

      method stacking(models=None, name="Stack", **kwargs)[source]
      Add a Stacking model to the pipeline.

      Warning

      Combining models trained on different branches into one @@ -3853,7 +3852,7 @@

      Methods



      -

      method voting(models=None, name="Vote", **kwargs)[source]
      Add a Voting model to the pipeline.

      +

      method voting(models=None, name="Vote", **kwargs)[source]
      Add a Voting model to the pipeline.

      Warning

      Combining models trained on different branches into one @@ -4031,4 +4030,4 @@

      Methods

      - \ No newline at end of file + diff --git a/docs/API/training/successivehalvingregressor/index.html b/docs/API/training/successivehalvingregressor/index.html index 8a2f7b4af..a15e0a62f 100644 --- a/docs/API/training/successivehalvingregressor/index.html +++ b/docs/API/training/successivehalvingregressor/index.html @@ -1164,7 +1164,7 @@
    9. - DirectRegressor + DirectForecaster
    10. @@ -1206,7 +1206,7 @@
    11. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    12. @@ -1337,7 +1337,7 @@
    13. - TrainSizingRegressor + TrainSizingForecaster
    14. @@ -3322,7 +3322,7 @@

      SuccessiveHalvingRegressor


      -

      class atom.training.SuccessiveHalvingRegressor(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
      Train and evaluate the models in a successive halving fashion.

      +

      class atom.training.SuccessiveHalvingRegressor(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
      Train and evaluate the models in a successive halving fashion.

      The following steps are applied to every model (per iteration):

      1. Apply hyperparameter tuning (optional).
      2. @@ -3400,7 +3400,7 @@

        SuccessiveHalvingRegressor

      3. "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials.
      n_jobs: int, default=1
      @@ -3415,17 +3415,16 @@

      SuccessiveHalvingRegressor

      follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

      -

      engine: dict or None, default=None
      +

      engine: dict, default={"data": "numpy", "estimator": "sklearn"}
      Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

      +corresponding choice as values. Choose from:



      -

      method clear()[source]
      Reset attributes and clear cache from all models.

      +

      method clear()[source]
      Reset attributes and clear cache from all models.

      Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3651,7 +3650,7 @@

      Methods

    15. Cached holdout data sets


    16. -

      method delete(models=None)[source]
      Delete models.

      +

      method delete(models=None)[source]
      Delete models.

      If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3662,7 +3661,7 @@

      Methods



      -

      method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
      Get all models' scores for the provided metrics.

      +

      method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
      Get all models' scores for the provided metrics.

      Parameters
      metric: str, func, scorer, sequence or None, default=None
      Metric to calculate. If None, it returns an overview of @@ -3689,7 +3688,7 @@

      Methods



      -

      method export_pipeline(model=None, memory=None, verbose=None)[source]
      Export the pipeline to a sklearn-like object.

      +

      method export_pipeline(model=None, memory=None, verbose=None)[source]
      Export the pipeline to a sklearn-like object.

      Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

      @@ -3728,7 +3727,7 @@

      Methods



      -

      method get_class_weight(dataset="train")[source]
      Return class weights for a balanced data set.

      +

      method get_class_weight(dataset="train")[source]
      Return class weights for a balanced data set.

      Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3753,7 +3752,7 @@

      Methods



      -

      method log(msg, level=0, severity="info")[source]
      Print message and save to log file.

      +

      method log(msg, level=0, severity="info")[source]
      Print message and save to log file.

      Parameters
      msg: int, float or str
      Message to save to the logger and print to stdout.

      @@ -3765,7 +3764,7 @@

      Methods



      -

      method merge(other, suffix="2")[source]
      Merge another instance of the same class into this one.

      +

      method merge(other, suffix="2")[source]
      Merge another instance of the same class into this one.

      Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3783,7 +3782,7 @@

      Methods



      -

      method update_layout(**kwargs)[source]
      Update the properties of the plot's layout.

      +

      method update_layout(**kwargs)[source]
      Update the properties of the plot's layout.

      Recursively update the structure of the original layout with the values in the arguments.

      @@ -3792,7 +3791,7 @@

      Methods



      -

      method update_traces(**kwargs)[source]
      Update the properties of the plot's traces.

      +

      method update_traces(**kwargs)[source]
      Update the properties of the plot's traces.

      Recursively update the structure of the original traces with the values in the arguments.

      @@ -3801,9 +3800,9 @@

      Methods



      -

      method reset_aesthetics()[source]
      Reset the plot aesthetics to their default values.

      +

      method reset_aesthetics()[source]
      Reset the plot aesthetics to their default values.



      -

      method run(*arrays)[source]
      Train and evaluate the models.

      +

      method run(*arrays)[source]
      Train and evaluate the models.

      Read more in the user guide.

      Parameters
      *arrays: sequence of indexables
      @@ -3816,7 +3815,7 @@

      Methods



      -

      method save(filename="auto", save_data=True)[source]
      Save the instance to a pickle file.

      +

      method save(filename="auto", save_data=True)[source]
      Save the instance to a pickle file.

      Parameters
      filename: str, default="auto"
      Name of the file. Use "auto" for automatic naming.

      @@ -3836,7 +3835,7 @@

      Methods



      -

      method stacking(models=None, name="Stack", **kwargs)[source]
      Add a Stacking model to the pipeline.

      +

      method stacking(models=None, name="Stack", **kwargs)[source]
      Add a Stacking model to the pipeline.

      Warning

      Combining models trained on different branches into one @@ -3856,7 +3855,7 @@

      Methods



      -

      method voting(models=None, name="Vote", **kwargs)[source]
      Add a Voting model to the pipeline.

      +

      method voting(models=None, name="Vote", **kwargs)[source]
      Add a Voting model to the pipeline.

      Warning

      Combining models trained on different branches into one @@ -3905,7 +3904,7 @@

      Methods

      @@ -4034,4 +4033,4 @@

      Methods

      - \ No newline at end of file + diff --git a/docs/API/training/trainsizingclassifier/index.html b/docs/API/training/trainsizingclassifier/index.html index 3003263c3..f4358d609 100644 --- a/docs/API/training/trainsizingclassifier/index.html +++ b/docs/API/training/trainsizingclassifier/index.html @@ -1164,7 +1164,7 @@
    17. - DirectRegressor + DirectForecaster
    18. @@ -1206,7 +1206,7 @@
    19. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
    20. @@ -1337,7 +1337,7 @@
    21. - TrainSizingRegressor + TrainSizingForecaster
    22. @@ -3322,7 +3322,7 @@

      TrainSizingClassifier


      -

      class atom.training.TrainSizingClassifier(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
      Train and evaluate the models in a train sizing fashion.

      +

      class atom.training.TrainSizingClassifier(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
      Train and evaluate the models in a train sizing fashion.

      The following steps are applied to every model (per iteration):

      1. Apply hyperparameter tuning (optional).
      2. @@ -3411,7 +3411,7 @@

        TrainSizingClassifier

      3. "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials.
      4. n_jobs: int, default=1

        @@ -3426,17 +3426,16 @@

        TrainSizingClassifier

        follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

        -

        engine: dict or None, default=None
        +

        engine: dict, default={"data": "numpy", "estimator": "sklearn"}
        Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

        +corresponding choice as values. Choose from:

        • "data":

            -
          • "numpy" (default)
          • +
          • "numpy"
          • "pyarrow"
          • "modin"
          @@ -3444,7 +3443,7 @@

          TrainSizingClassifier

        • "estimator":

            -
          • "sklearn" (default)
          • +
          • "sklearn"
          • "sklearnex"
          • "cuml"
          @@ -3524,7 +3523,7 @@

          Data attributes

          Updating the dataset will automatically update the response of these attributes accordingly.

          -

          +

          Attributes
          dataset: dataframe
          Complete data set.
          train: dataframe
          Training set.
          test: dataframe
          Test set.
          X: dataframe
          Feature set.
          y: series | dataframe
          Target column(s).
          X_train: dataframe
          Features of the training set.
          y_train: series | dataframe
          Target column(s) of the training set.
          X_test: dataframe
          Features of the test set.
          y_test: series | dataframe
          Target column(s) of the test set.
          shape: tuple[int, int]
          Shape of the dataset (n_rows, n_columns).
          columns: series
          Name of all the columns.
          n_columns: int
          Number of columns.
          features: series
          Name of the features.
          n_features: int
          Number of features.
          target: str | list[str]
          Name of the target column(s).
          Attributes
          dataset: dataframe
          Complete data set.
          train: dataframe
          Training set.
          test: dataframe
          Test set.
          X: dataframe
          Feature set.
          y: series | dataframe
          Target column(s).
          X_train: dataframe
          Features of the training set.
          y_train: series | dataframe
          Target column(s) of the training set.
          X_test: dataframe
          Features of the test set.
          y_test: series | dataframe
          Target column(s) of the test set.
          shape: tuple[int, int]
          Shape of the dataset (n_rows, n_columns).
          columns: index
          Name of all the columns.
          n_columns: int
          Number of columns.
          features: index
          Name of the features.
          n_features: int
          Number of features.
          target: str | list[str]
          Name of the target column(s).


          Utility attributes

          @@ -3568,7 +3567,7 @@

          Plot attributes

          The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

          -

          Attributes
          palette: str | SEQUENCE
          Color palette.

          +

          Attributes
          palette: str | sequence
          Color palette.

          Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = ["red", "green", "blue"].

          title_fontsize: int
          Fontsize for the plot's title.
          label_fontsize: int
          Fontsize for the labels, legend and hover information.
          tick_fontsize: int
          Fontsize for the ticks along the plot's axes.
          line_width: int
          Width of the line plots.
          marker_size: int
          Size of the markers.
          @@ -3580,7 +3579,7 @@

          Methods

          available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the pipeline to a sklearn-like object.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.logPrint message and save to log file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.


          -

          method available_models()[source]
          Give an overview of the available predefined models.

          +

          method available_models()[source]
          Give an overview of the available predefined models.

          Returnspd.DataFrame
          Information about the available predefined models. Columns @@ -3602,7 +3601,7 @@

          Methods



          -

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
          Create a figure with multiple plots.

          +

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
          Create a figure with multiple plots.

          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

          @@ -3647,7 +3646,7 @@

          Methods



        -

        method clear()[source]
        Reset attributes and clear cache from all models.

        +

        method clear()[source]
        Reset attributes and clear cache from all models.

        Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3662,7 +3661,7 @@

        Methods

      5. Cached holdout data sets


      6. -

        method delete(models=None)[source]
        Delete models.

        +

        method delete(models=None)[source]
        Delete models.

        If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3673,7 +3672,7 @@

        Methods



        -

        method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
        Get all models' scores for the provided metrics.

        +

        method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
        Get all models' scores for the provided metrics.

        Parametersmetric: str, func, scorer, sequence or None, default=None
        Metric to calculate. If None, it returns an overview of @@ -3700,7 +3699,7 @@

        Methods



        -

        method export_pipeline(model=None, memory=None, verbose=None)[source]
        Export the pipeline to a sklearn-like object.

        +

        method export_pipeline(model=None, memory=None, verbose=None)[source]
        Export the pipeline to a sklearn-like object.

        Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

        @@ -3739,7 +3738,7 @@

        Methods



        -

        method get_class_weight(dataset="train")[source]
        Return class weights for a balanced data set.

        +

        method get_class_weight(dataset="train")[source]
        Return class weights for a balanced data set.

        Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3764,7 +3763,7 @@

        Methods



        -

        method log(msg, level=0, severity="info")[source]
        Print message and save to log file.

        +

        method log(msg, level=0, severity="info")[source]
        Print message and save to log file.

        Parametersmsg: int, float or str
        Message to save to the logger and print to stdout.

        @@ -3776,7 +3775,7 @@

        Methods



        -

        method merge(other, suffix="2")[source]
        Merge another instance of the same class into this one.

        +

        method merge(other, suffix="2")[source]
        Merge another instance of the same class into this one.

        Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3794,7 +3793,7 @@

        Methods



        -

        method update_layout(**kwargs)[source]
        Update the properties of the plot's layout.

        +

        method update_layout(**kwargs)[source]
        Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        @@ -3803,7 +3802,7 @@

        Methods



        -

        method update_traces(**kwargs)[source]
        Update the properties of the plot's traces.

        +

        method update_traces(**kwargs)[source]
        Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        @@ -3812,9 +3811,9 @@

        Methods



        -

        method reset_aesthetics()[source]
        Reset the plot aesthetics to their default values.

        +

        method reset_aesthetics()[source]
        Reset the plot aesthetics to their default values.



        -

        method run(*arrays)[source]
        Train and evaluate the models.

        +

        method run(*arrays)[source]
        Train and evaluate the models.

        Read more in the user guide.

        Parameters*arrays: sequence of indexables
        @@ -3827,7 +3826,7 @@

        Methods



        -

        method save(filename="auto", save_data=True)[source]
        Save the instance to a pickle file.

        +

        method save(filename="auto", save_data=True)[source]
        Save the instance to a pickle file.

        Parametersfilename: str, default="auto"
        Name of the file. Use "auto" for automatic naming.

        @@ -3847,7 +3846,7 @@

        Methods



        -

        method stacking(models=None, name="Stack", **kwargs)[source]
        Add a Stacking model to the pipeline.

        +

        method stacking(models=None, name="Stack", **kwargs)[source]
        Add a Stacking model to the pipeline.

        Warning

        Combining models trained on different branches into one @@ -3867,7 +3866,7 @@

        Methods



        -

        method voting(models=None, name="Vote", **kwargs)[source]
        Add a Voting model to the pipeline.

        +

        method voting(models=None, name="Vote", **kwargs)[source]
        Add a Voting model to the pipeline.

        Warning

        Combining models trained on different branches into one @@ -3932,13 +3931,13 @@

        Methods

        - n_jobs: int, default=1
        @@ -3421,17 +3421,16 @@

        TrainSizingForecaster

        follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

        -

        engine: dict or None, default=None
        +

        engine: dict, default={"data": "numpy", "estimator": "sklearn"}
        Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

        +corresponding choice as values. Choose from:

        • "data":

            -
          • "numpy" (default)
          • +
          • "numpy"
          • "pyarrow"
          • "modin"
          @@ -3439,7 +3438,7 @@

          TrainSizingForecaster

        • "estimator":

            -
          • "sklearn" (default)
          • +
          • "sklearn"
          • "sklearnex"
          • "cuml"
          @@ -3516,7 +3515,7 @@

          Data attributes

          Updating the dataset will automatically update the response of these attributes accordingly.

          -

          +

          Attributesdataset: dataframe
          Complete data set.
          train: dataframe
          Training set.
          test: dataframe
          Test set.
          X: dataframe
          Feature set.
          y: series | dataframe
          Target column(s).
          X_train: dataframe
          Features of the training set.
          y_train: series | dataframe
          Target column(s) of the training set.
          X_test: dataframe
          Features of the test set.
          y_test: series | dataframe
          Target column(s) of the test set.
          shape: tuple[int, int]
          Shape of the dataset (n_rows, n_columns).
          columns: series
          Name of all the columns.
          n_columns: int
          Number of columns.
          features: series
          Name of the features.
          n_features: int
          Number of features.
          target: str | list[str]
          Name of the target column(s).
          Attributesdataset: dataframe
          Complete data set.
          train: dataframe
          Training set.
          test: dataframe
          Test set.
          X: dataframe
          Feature set.
          y: series | dataframe
          Target column(s).
          X_train: dataframe
          Features of the training set.
          y_train: series | dataframe
          Target column(s) of the training set.
          X_test: dataframe
          Features of the test set.
          y_test: series | dataframe
          Target column(s) of the test set.
          shape: tuple[int, int]
          Shape of the dataset (n_rows, n_columns).
          columns: index
          Name of all the columns.
          n_columns: int
          Number of columns.
          features: index
          Name of the features.
          n_features: int
          Number of features.
          target: str | list[str]
          Name of the target column(s).


          Utility attributes

          @@ -3560,7 +3559,7 @@

          Plot attributes

          The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

          -

          Attributespalette: str | SEQUENCE
          Color palette.

          +

          Attributespalette: str | sequence
          Color palette.

          Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = ["red", "green", "blue"].

          title_fontsize: int
          Fontsize for the plot's title.
          label_fontsize: int
          Fontsize for the labels, legend and hover information.
          tick_fontsize: int
          Fontsize for the ticks along the plot's axes.
          line_width: int
          Width of the line plots.
          marker_size: int
          Size of the markers.
          @@ -3572,7 +3571,7 @@

          Methods

          available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the pipeline to a sklearn-like object.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.logPrint message and save to log file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.


          -

          method available_models()[source]
          Give an overview of the available predefined models.

          +

          method available_models()[source]
          Give an overview of the available predefined models.

          Returnspd.DataFrame
          Information about the available predefined models. Columns @@ -3594,7 +3593,7 @@

          Methods



          -

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
          Create a figure with multiple plots.

          +

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
          Create a figure with multiple plots.

          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

          @@ -3639,7 +3638,7 @@

          Methods



        -

        method clear()[source]
        Reset attributes and clear cache from all models.

        +

        method clear()[source]
        Reset attributes and clear cache from all models.

        Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3654,7 +3653,7 @@

        Methods

      7. Cached holdout data sets


      8. -

        method delete(models=None)[source]
        Delete models.

        +

        method delete(models=None)[source]
        Delete models.

        If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3665,7 +3664,7 @@

        Methods



        -

        method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
        Get all models' scores for the provided metrics.

        +

        method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
        Get all models' scores for the provided metrics.

        Parametersmetric: str, func, scorer, sequence or None, default=None
        Metric to calculate. If None, it returns an overview of @@ -3692,7 +3691,7 @@

        Methods



        -

        method export_pipeline(model=None, memory=None, verbose=None)[source]
        Export the pipeline to a sklearn-like object.

        +

        method export_pipeline(model=None, memory=None, verbose=None)[source]
        Export the pipeline to a sklearn-like object.

        Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

        @@ -3731,7 +3730,7 @@

        Methods



        -

        method get_class_weight(dataset="train")[source]
        Return class weights for a balanced data set.

        +

        method get_class_weight(dataset="train")[source]
        Return class weights for a balanced data set.

        Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3756,7 +3755,7 @@

        Methods



        -

        method log(msg, level=0, severity="info")[source]
        Print message and save to log file.

        +

        method log(msg, level=0, severity="info")[source]
        Print message and save to log file.

        Parametersmsg: int, float or str
        Message to save to the logger and print to stdout.

        @@ -3768,7 +3767,7 @@

        Methods



        -

        method merge(other, suffix="2")[source]
        Merge another instance of the same class into this one.

        +

        method merge(other, suffix="2")[source]
        Merge another instance of the same class into this one.

        Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3786,7 +3785,7 @@

        Methods



        -

        method update_layout(**kwargs)[source]
        Update the properties of the plot's layout.

        +

        method update_layout(**kwargs)[source]
        Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        @@ -3795,7 +3794,7 @@

        Methods



        -

        method update_traces(**kwargs)[source]
        Update the properties of the plot's traces.

        +

        method update_traces(**kwargs)[source]
        Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        @@ -3804,9 +3803,9 @@

        Methods



        -

        method reset_aesthetics()[source]
        Reset the plot aesthetics to their default values.

        +

        method reset_aesthetics()[source]
        Reset the plot aesthetics to their default values.



        -

        method run(*arrays)[source]
        Train and evaluate the models.

        +

        method run(*arrays)[source]
        Train and evaluate the models.

        Read more in the user guide.

        Parameters*arrays: sequence of indexables
        @@ -3819,7 +3818,7 @@

        Methods



        -

        method save(filename="auto", save_data=True)[source]
        Save the instance to a pickle file.

        +

        method save(filename="auto", save_data=True)[source]
        Save the instance to a pickle file.

        Parametersfilename: str, default="auto"
        Name of the file. Use "auto" for automatic naming.

        @@ -3839,7 +3838,7 @@

        Methods



        -

        method stacking(models=None, name="Stack", **kwargs)[source]
        Add a Stacking model to the pipeline.

        +

        method stacking(models=None, name="Stack", **kwargs)[source]
        Add a Stacking model to the pipeline.

        Warning

        Combining models trained on different branches into one @@ -3859,7 +3858,7 @@

        Methods



        -

        method voting(models=None, name="Vote", **kwargs)[source]
        Add a Voting model to the pipeline.

        +

        method voting(models=None, name="Vote", **kwargs)[source]
        Add a Voting model to the pipeline.

        Warning

        Combining models trained on different branches into one @@ -4037,4 +4036,4 @@

        Methods

        - \ No newline at end of file + diff --git a/docs/API/training/trainsizingregressor/index.html b/docs/API/training/trainsizingregressor/index.html index 19555c7d4..3410f981e 100644 --- a/docs/API/training/trainsizingregressor/index.html +++ b/docs/API/training/trainsizingregressor/index.html @@ -1164,7 +1164,7 @@
      9. - DirectRegressor + DirectForecaster
      10. @@ -1206,7 +1206,7 @@
      11. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      12. @@ -1248,7 +1248,7 @@
      13. - TrainSizingRegressor + TrainSizingForecaster
      14. @@ -3322,7 +3322,7 @@

        TrainSizingRegressor


        -

        class atom.training.TrainSizingRegressor(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine=None, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
        Train and evaluate the models in a train sizing fashion.

        +

        class atom.training.TrainSizingRegressor(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors="skip", n_jobs=1, device="cpu", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend="loky", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]
        Train and evaluate the models in a train sizing fashion.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. @@ -3406,7 +3406,7 @@

          TrainSizingRegressor

        3. "keep": Keep the model in its state at failure. Note that this model can break down many other methods after training. This option is useful to be able to rerun hyperparameter - optimization after failure without losing previous succesfull + optimization after failure without losing previous successful trials.
        n_jobs: int, default=1
        @@ -3421,17 +3421,16 @@

        TrainSizingRegressor

        follows the SYCL_DEVICE_FILTER filter selector, e.g. device="gpu" to use the GPU. Read more in the user guide.

        -

        engine: dict or None, default=None
        +

        engine: dict, default={"data": "numpy", "estimator": "sklearn"}
        Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their -corresponding choice as values. If None, the default options -are selected. Choose from:

        +corresponding choice as values. Choose from:

        • "data":

            -
          • "numpy" (default)
          • +
          • "numpy"
          • "pyarrow"
          • "modin"
          @@ -3439,7 +3438,7 @@

          TrainSizingRegressor

        • "estimator":

            -
          • "sklearn" (default)
          • +
          • "sklearn"
          • "sklearnex"
          • "cuml"
          @@ -3519,7 +3518,7 @@

          Data attributes

          Updating the dataset will automatically update the response of these attributes accordingly.

          -

          +

          Attributesdataset: dataframe
          Complete data set.
          train: dataframe
          Training set.
          test: dataframe
          Test set.
          X: dataframe
          Feature set.
          y: series | dataframe
          Target column(s).
          X_train: dataframe
          Features of the training set.
          y_train: series | dataframe
          Target column(s) of the training set.
          X_test: dataframe
          Features of the test set.
          y_test: series | dataframe
          Target column(s) of the test set.
          shape: tuple[int, int]
          Shape of the dataset (n_rows, n_columns).
          columns: series
          Name of all the columns.
          n_columns: int
          Number of columns.
          features: series
          Name of the features.
          n_features: int
          Number of features.
          target: str | list[str]
          Name of the target column(s).
          Attributesdataset: dataframe
          Complete data set.
          train: dataframe
          Training set.
          test: dataframe
          Test set.
          X: dataframe
          Feature set.
          y: series | dataframe
          Target column(s).
          X_train: dataframe
          Features of the training set.
          y_train: series | dataframe
          Target column(s) of the training set.
          X_test: dataframe
          Features of the test set.
          y_test: series | dataframe
          Target column(s) of the test set.
          shape: tuple[int, int]
          Shape of the dataset (n_rows, n_columns).
          columns: index
          Name of all the columns.
          n_columns: int
          Number of columns.
          features: index
          Name of the features.
          n_features: int
          Number of features.
          target: str | list[str]
          Name of the target column(s).


          Utility attributes

          @@ -3563,7 +3562,7 @@

          Plot attributes

          The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

          -

          Attributespalette: str | SEQUENCE
          Color palette.

          +

          Attributespalette: str | sequence
          Color palette.

          Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = ["red", "green", "blue"].

          title_fontsize: int
          Fontsize for the plot's title.
          label_fontsize: int
          Fontsize for the labels, legend and hover information.
          tick_fontsize: int
          Fontsize for the ticks along the plot's axes.
          line_width: int
          Width of the line plots.
          marker_size: int
          Size of the markers.
          @@ -3575,7 +3574,7 @@

          Methods

          available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the pipeline to a sklearn-like object.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.logPrint message and save to log file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.


          -

          method available_models()[source]
          Give an overview of the available predefined models.

          +

          method available_models()[source]
          Give an overview of the available predefined models.

          Returnspd.DataFrame
          Information about the available predefined models. Columns @@ -3597,7 +3596,7 @@

          Methods



          -

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
          Create a figure with multiple plots.

          +

          method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend="out", figsize=None, filename=None, display=True)[source]
          Create a figure with multiple plots.

          This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

          @@ -3642,7 +3641,7 @@

          Methods



        -

        method clear()[source]
        Reset attributes and clear cache from all models.

        +

        method clear()[source]
        Reset attributes and clear cache from all models.

        Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected @@ -3657,7 +3656,7 @@

        Methods

      15. Cached holdout data sets


      16. -

        method delete(models=None)[source]
        Delete models.

        +

        method delete(models=None)[source]
        Delete models.

        If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from @@ -3668,7 +3667,7 @@

        Methods



        -

        method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
        Get all models' scores for the provided metrics.

        +

        method evaluate(metric=None, dataset="test", threshold=0.5, sample_weight=None)[source]
        Get all models' scores for the provided metrics.

        Parametersmetric: str, func, scorer, sequence or None, default=None
        Metric to calculate. If None, it returns an overview of @@ -3695,7 +3694,7 @@

        Methods



        -

        method export_pipeline(model=None, memory=None, verbose=None)[source]
        Export the pipeline to a sklearn-like object.

        +

        method export_pipeline(model=None, memory=None, verbose=None)[source]
        Export the pipeline to a sklearn-like object.

        Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

        @@ -3734,7 +3733,7 @@

        Methods



        -

        method get_class_weight(dataset="train")[source]
        Return class weights for a balanced data set.

        +

        method get_class_weight(dataset="train")[source]
        Return class weights for a balanced data set.

        Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely @@ -3759,7 +3758,7 @@

        Methods



        -

        method log(msg, level=0, severity="info")[source]
        Print message and save to log file.

        +

        method log(msg, level=0, severity="info")[source]
        Print message and save to log file.

        Parametersmsg: int, float or str
        Message to save to the logger and print to stdout.

        @@ -3771,7 +3770,7 @@

        Methods



        -

        method merge(other, suffix="2")[source]
        Merge another instance of the same class into this one.

        +

        method merge(other, suffix="2")[source]
        Merge another instance of the same class into this one.

        Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix @@ -3789,7 +3788,7 @@

        Methods



        -

        method update_layout(**kwargs)[source]
        Update the properties of the plot's layout.

        +

        method update_layout(**kwargs)[source]
        Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        @@ -3798,7 +3797,7 @@

        Methods



        -

        method update_traces(**kwargs)[source]
        Update the properties of the plot's traces.

        +

        method update_traces(**kwargs)[source]
        Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        @@ -3807,9 +3806,9 @@

        Methods



        -

        method reset_aesthetics()[source]
        Reset the plot aesthetics to their default values.

        +

        method reset_aesthetics()[source]
        Reset the plot aesthetics to their default values.



        -

        method run(*arrays)[source]
        Train and evaluate the models.

        +

        method run(*arrays)[source]
        Train and evaluate the models.

        Read more in the user guide.

        Parameters*arrays: sequence of indexables
        @@ -3822,7 +3821,7 @@

        Methods



        -

        method save(filename="auto", save_data=True)[source]
        Save the instance to a pickle file.

        +

        method save(filename="auto", save_data=True)[source]
        Save the instance to a pickle file.

        Parametersfilename: str, default="auto"
        Name of the file. Use "auto" for automatic naming.

        @@ -3842,7 +3841,7 @@

        Methods



        -

        method stacking(models=None, name="Stack", **kwargs)[source]
        Add a Stacking model to the pipeline.

        +

        method stacking(models=None, name="Stack", **kwargs)[source]
        Add a Stacking model to the pipeline.

        Warning

        Combining models trained on different branches into one @@ -3862,7 +3861,7 @@

        Methods



        -

        method voting(models=None, name="Vote", **kwargs)[source]
        Add a Voting model to the pipeline.

        +

        method voting(models=None, name="Vote", **kwargs)[source]
        Add a Voting model to the pipeline.

        Warning

        Combining models trained on different branches into one @@ -3911,7 +3910,7 @@

        Methods

        @@ -4040,4 +4039,4 @@

        Methods

        - \ No newline at end of file + diff --git a/docs/about/index.html b/docs/about/index.html index 57f5ee7d4..6e5eb5e7d 100644 --- a/docs/about/index.html +++ b/docs/about/index.html @@ -1234,7 +1234,7 @@
      17. - DirectRegressor + DirectForecaster
      18. @@ -1276,7 +1276,7 @@
      19. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      20. @@ -1318,7 +1318,7 @@
      21. - TrainSizingRegressor + TrainSizingForecaster
      22. diff --git a/docs/changelog/v4.x.x/index.html b/docs/changelog/v4.x.x/index.html index c69b86a7c..ae671a6c2 100644 --- a/docs/changelog/v4.x.x/index.html +++ b/docs/changelog/v4.x.x/index.html @@ -1160,7 +1160,7 @@
      23. - DirectRegressor + DirectForecaster
      24. @@ -1202,7 +1202,7 @@
      25. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      26. @@ -1244,7 +1244,7 @@
      27. - TrainSizingRegressor + TrainSizingForecaster
      28. diff --git a/docs/changelog/v5.x.x/index.html b/docs/changelog/v5.x.x/index.html index ee3ad4453..d8d38ec30 100644 --- a/docs/changelog/v5.x.x/index.html +++ b/docs/changelog/v5.x.x/index.html @@ -1160,7 +1160,7 @@
      29. - DirectRegressor + DirectForecaster
      30. @@ -1202,7 +1202,7 @@
      31. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      32. @@ -1244,7 +1244,7 @@
      33. - TrainSizingRegressor + TrainSizingForecaster
      34. @@ -3349,6 +3349,7 @@

        Version 6.0.0

      35. Fixed a bug where the register method failed in Databricks.
      36. Fixed a bug where tuning hyperparameter for a base_estimator inside a custom meta-estimator would fail.
      37. +
      38. Fixed a bug where the data properties' @setter could fail for numpy arrays.
      39. Version 5.2.0

        diff --git a/docs/contributing/index.html b/docs/contributing/index.html index 95e43c0da..5693db553 100644 --- a/docs/contributing/index.html +++ b/docs/contributing/index.html @@ -1160,7 +1160,7 @@
      40. - DirectRegressor + DirectForecaster
      41. @@ -1202,7 +1202,7 @@
      42. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      43. @@ -1244,7 +1244,7 @@
      44. - TrainSizingRegressor + TrainSizingForecaster
      45. diff --git a/docs/dependencies/index.html b/docs/dependencies/index.html index 5a57b0147..fa43008ce 100644 --- a/docs/dependencies/index.html +++ b/docs/dependencies/index.html @@ -1160,7 +1160,7 @@
      46. - DirectRegressor + DirectForecaster
      47. @@ -1202,7 +1202,7 @@
      48. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      49. @@ -1244,7 +1244,7 @@
      50. - TrainSizingRegressor + TrainSizingForecaster
      51. @@ -3332,6 +3332,7 @@

        Required

      52. scipy (>=1.10.1)
      53. shap (>=0.42.1)
      54. sktime (>=0.20.1)
      55. +
      56. typeguard (>=4.1.3)
      57. zoofs (>=0.1.26)
      58. Optional

        @@ -3349,7 +3350,7 @@

        Optional

      59. schemdraw (>=0.16)
      60. wordcloud (>=1.9.2)
      61. xgboost (>=1.7.4)
      62. -
      63. ydata-profiling (>=4.3.1)
      64. +
      65. ydata-profiling (>=4.5.1)
      66. Development

        The development dependencies are not installed with the package, and are diff --git a/docs/faq/index.html b/docs/faq/index.html index 1c882fd34..2373ea1cf 100644 --- a/docs/faq/index.html +++ b/docs/faq/index.html @@ -1160,7 +1160,7 @@

      67. - DirectRegressor + DirectForecaster
      68. @@ -1202,7 +1202,7 @@
      69. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      70. @@ -1244,7 +1244,7 @@
      71. - TrainSizingRegressor + TrainSizingForecaster
      72. diff --git a/docs/getting_started/index.html b/docs/getting_started/index.html index a4cdcc900..37fdd1d4a 100644 --- a/docs/getting_started/index.html +++ b/docs/getting_started/index.html @@ -1208,7 +1208,7 @@
      73. - DirectRegressor + DirectForecaster
      74. @@ -1250,7 +1250,7 @@
      75. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      76. @@ -1292,7 +1292,7 @@
      77. - TrainSizingRegressor + TrainSizingForecaster
      78. diff --git a/docs/index.html b/docs/index.html index 6aad95952..756473fcf 100644 --- a/docs/index.html +++ b/docs/index.html @@ -1629,7 +1629,7 @@

        Read our stories

      79. - DirectRegressor + DirectForecaster
      80. @@ -1671,7 +1671,7 @@

        Read our stories

      81. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      82. @@ -1713,7 +1713,7 @@

        Read our stories

      83. - TrainSizingRegressor + TrainSizingForecaster
      84. diff --git a/docs/license/index.html b/docs/license/index.html index bc0b70a4a..1bc96abbe 100644 --- a/docs/license/index.html +++ b/docs/license/index.html @@ -1158,7 +1158,7 @@
      85. - DirectRegressor + DirectForecaster
      86. @@ -1200,7 +1200,7 @@
      87. - SuccessiveHalvingRegressor + SuccessiveHalvingForecaster
      88. @@ -1242,7 +1242,7 @@
      89. - TrainSizingRegressor + TrainSizingForecaster
      90. diff --git a/docs/scripts/autodocs.py b/docs/scripts/autodocs.py index a0a69edac..65043e36e 100644 --- a/docs/scripts/autodocs.py +++ b/docs/scripts/autodocs.py @@ -50,6 +50,7 @@ study="https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html", optimize="https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize", trial="https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html", + frozentrial="https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.FrozenTrial.html", normal="https://github.com/sktime/sktime/blob/b29e147b54959a53cc96e5be9c3f819717aa38e7/sktime/proba/normal.py#L13", forecastinghorizon="https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.forecasting.base.ForecastingHorizon.html#sktime.forecasting.base.ForecastingHorizon", interface="https://gradio.app/docs/#interface", @@ -787,7 +788,6 @@ def render(markdown: str, **kwargs) -> str: Modified markdown/html source text of page. 
""" - autodocs = None while match := re.search("(:: )(\w.*?)(?=::|\n\n|\Z)", markdown, re.S): command = yaml.safe_load(match.group(2)) @@ -854,15 +854,19 @@ def types_conversion(dtype: str) -> str: """ types = { "CustomDict": "dict", + "BOOL": "bool", "INT": "int", "FLOAT": "float", + "INDEX": "index", "SERIES": "series", + "SEQUENCE": "sequence", "DATAFRAME": "dataframe", "PANDAS": "series | dataframe", - "Branch": "[Branch][branches]", - "Model": "[model][models]", + "BRANCH": "[Branch][branches]", + "MODEL": "[model][models]", "Study": "[Study][]", "Trial": "[Trial][]", + "FrozenTrial": "[FrozenTrial][]", "Normal": "[Normal][]", } diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 1eca19903..2e0614573 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -1 +1 @@ -{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "about/", "title": "About", "text": ""}, {"location": "about/#what-is-it", "title": "What is it?", "text": "

        Automated Tool for Optimized Modelling (ATOM) is an open-source Python package designed to help data scientists speed up the exploration phase of their machine learning projects. ATOM is a low-code, easy-to-use library, capable of running experiments quickly and efficiently, enabling the user to go from raw data to generating insights in just a few lines of code. Click here to get started.

        "}, {"location": "about/#what-can-i-do-with-it", "title": "What can I do with it?", "text": "

        ATOM is an end-to-end solution for machine learning pipelines. It supports the user from raw data ingestion to the final results' analysis and model deployment. Click on the icons to read more about its main functionalities.

        Data cleaning Feature engineering Model selection Hyperparameter tuning Model training Model predictions Experiment logging Analysis & Interpretability"}, {"location": "about/#who-is-it-intended-for", "title": "Who is it intended for?", "text": ""}, {"location": "about/#citing-atom", "title": "Citing ATOM", "text": "

        If you use ATOM in a scientific publication, please consider citing this documentation page as the resource. ATOM\u2019s first stable release v2.0.3 was made publicly available in November 2019. A formatted version of the citation would look like this:

        ATOM v2.0.3, November 2019. URL https://tvdboom.github.io/ATOM/

        BibTeX entry:

        @Manual{ATOM,\n    title = {ATOM: A Python package for fast exploration of machine learning pipelines},\n    author = {Mavs},\n    year={2019},\n    month={November},\n    note = {ATOM version 2.0.3},\n    url = {https://tvdboom.github.io/ATOM/},\n}\n

        "}, {"location": "about/#support", "title": "Support", "text": "

        ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.

        "}, {"location": "about/#integrations", "title": "Integrations", "text": ""}, {"location": "contributing/", "title": "Contributing", "text": "

        Are you interested in contributing to ATOM? Do you want to report a bug? Do you have a question? Before you do, please read the following guidelines.

        "}, {"location": "contributing/#submission-context", "title": "Submission context", "text": ""}, {"location": "contributing/#question-or-problem", "title": "Question or problem?", "text": "

        For quick questions there's no need to open an issue. Check first if the question isn't already answered on the FAQ section. If not, reach us through the discussions page or on the slack channel.

        "}, {"location": "contributing/#report-a-bug", "title": "Report a bug?", "text": "

        If you found a bug in the source code, you can help by submitting an issue to the issue tracker in the GitHub repository. Even better, you can submit a Pull Request with a fix. However, before doing so, please read the submission guidelines.

        "}, {"location": "contributing/#missing-a-feature", "title": "Missing a feature?", "text": "

        You can request a new feature by submitting an issue to the GitHub Repository. If you would like to implement a new feature, please submit an issue with a proposal for your work first. Please consider what kind of change it is:

        "}, {"location": "contributing/#project-layout", "title": "Project layout", "text": "

        The latest stable release of ATOM is on the master branch, whereas the latest version of ATOM in development is on the development branch. Make sure you are looking at and working on the correct branch if you're looking to contribute code.

        In terms of directory structure:

        Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.

        "}, {"location": "contributing/#submission-guidelines", "title": "Submission guidelines", "text": ""}, {"location": "contributing/#submitting-an-issue", "title": "Submitting an issue", "text": "

        Before you submit an issue, please search the issue tracker, maybe an issue for your problem already exists and the discussion might inform you of workarounds readily available.

        We want to fix all the issues as soon as possible, but before fixing a bug we need to reproduce and confirm it. In order to reproduce bugs we will systematically ask you to provide a minimal reproduction scenario using the custom issue template.

        "}, {"location": "contributing/#submitting-a-pull-request", "title": "Submitting a pull request", "text": "

        Before you submit a pull request, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes.

        If your contribution requires a new library dependency:

        After submitting your pull request, GitHub will automatically run the tests on your changes and make sure that the updated code builds successfully. The checks are run on Python 3.8, 3.9, 3.10 and 3.11, on Ubuntu and Windows. We also use services that automatically check code style and test coverage.

        "}, {"location": "dependencies/", "title": "Dependencies", "text": ""}, {"location": "dependencies/#python-os", "title": "Python & OS", "text": "

        At the moment, ATOM supports the following Python versions:

        And operating systems:

        "}, {"location": "dependencies/#packages", "title": "Packages", "text": ""}, {"location": "dependencies/#required", "title": "Required", "text": "

        ATOM is built on top of several existing Python libraries. These packages are necessary for its correct functioning.

        "}, {"location": "dependencies/#optional", "title": "Optional", "text": "

        Some specific models, utility methods or plots require the installation of additional libraries. You can install all the optional dependencies using pip install atom-ml[full]. Doing so also installs the following libraries:

        "}, {"location": "dependencies/#development", "title": "Development", "text": "

        The development dependencies are not installed with the package, and are not required for any of its functionalities. These libraries are only necessary to contribute to the project. Install them using pip install atom-ml[dev].

        Linting

        Testing

        Documentation

        "}, {"location": "faq/", "title": "Frequently asked questions", "text": "

        Here we try to give answers to some questions that have popped up regularly. If you have any other questions, don't hesitate to create a new discussion or post them on the Slack channel!

        Is this package related to the Atom text editor?

        There is, indeed, a text editor with the same name and a similar logo as this package. Is this a shameless copy? No. When I started the project, I didn't know about the text editor, and it doesn't require much thinking to come up with the idea of replacing the letter O of the word atom with the image of an atom.

        How does ATOM relate to AutoML?

        ATOM is not an AutoML tool since it does not automate the search for an optimal pipeline like well-known AutoML tools such as auto-sklearn or EvalML do. Instead, ATOM helps the user find the optimal pipeline themselves. One of the goals of this package is to help data scientists produce explainable pipelines, and using an AutoML black box function would impede that. That said, it is possible to integrate an EvalML pipeline with atom through the automl method.

        Is it possible to run deep learning models?

        Yes. Deep learning models can be added as custom models to the pipeline as long as they follow sklearn's API. For more information, see the deep learning section of the user guide.

        Can I run atom's methods on just a subset of the columns?

        Yes, all data cleaning and feature engineering methods accept a columns parameter to only transform the selected features. For example, to only impute the numerical columns in the dataset we could type atom.impute(strat_num=\"mean\", columns=atom.numerical). The parameter accepts column names, column indices, dtypes or a slice object.

        How can I compare the same model on different datasets?

        On many occasions you might want to test how a model performs on datasets processed with different pipelines. For this, atom has the branch system. Create a new branch for every new pipeline you want to test and use the plot methods to compare all models, independent of the branch they were trained on.

        Can I train models through atom using a GPU?

        Yes. Refer to the user guide to see what algorithms and models have a GPU implementation. Be aware that it could require additional software and hardware dependencies.

        How are numerical and categorical columns differentiated?

        The columns are separated using a dataframe's select_dtypes method. Numerical columns are selected using include=\"number\" whereas categorical columns are selected using exclude=\"number\".

        Can I run unsupervised learning pipelines?

        No. As of now, ATOM only supports supervised machine learning pipelines. However, various unsupervised algorithms can be chosen as the strategy in the Pruner class to detect and remove outliers from the dataset.

        Is there a way to plot multiple models in the same shap plot?

        No. Unfortunately, there is no way to plot multiple models in the same shap plot since the plots are made by the shap package and passed as matplotlib.axes objects to atom. This means that it's not within the reach of this package to implement such a utility.

        Can I merge a sklearn pipeline with atom?

        Yes. Like any other transformer, it is possible to add a sklearn pipeline to atom using the add method. Every transformer in the pipeline is merged independently. The pipeline is not allowed to end with a model since atom manages its own models. If that is the case, add the pipeline using atom.add(pipeline[:-1]).

        Is it possible to initialize atom with an existing train and test set?

        Yes. If you already have a separated train and test set you can initialize atom in two ways:

        Make sure the train and test sets have the same number of columns! If atom is initialized in either of these two ways, the test_size parameter is ignored.

        Can I train the models using cross-validation?

        Applying cross-validation means transforming every step of the pipeline multiple times, each with different results. Doing this would prevent ATOM from being able to show the transformation results after every pre-processing step, which means losing the ability to inspect how a transformer changed the dataset. For this reason, it is not possible to apply cross-validation until after a model has been trained. After a model has been trained, the pipeline is defined, and cross-validation can be applied using the cross_validate method. See here an example using cross-validation.

        Is there a way to process datetime features?

        Yes, the FeatureExtractor class can automatically extract useful features (day, month, year, etc...) from datetime columns. The extracted features are always encoded to numerical values, so they can be fed directly to a model.

        "}, {"location": "getting_started/", "title": "Getting started", "text": ""}, {"location": "getting_started/#installation", "title": "Installation", "text": "

        Install ATOM's newest release easily via pip:

        pip install -U atom-ml\n

        or via conda:

        conda install -c conda-forge atom-ml\n

        Note

        Since atom was already taken, download the package under the name atom-ml!

        Warning

        ATOM makes use of many other ML libraries, making its dependency list quite long. Because of that, the installation may take longer than you are accustomed to. Be patient!

        Optional dependencies

        Some specific models, utility methods or plots require the installation of additional libraries. To install the optional dependencies, add [full] after the package's name.

        pip install -U atom-ml[full]\n

        Latest source

        Sometimes, new features and bug fixes are already implemented in the development branch, but are waiting for the next release to be made available. If you can't wait for that, it's possible to install the package directly from git.

        pip install git+https://github.com/tvdboom/ATOM.git@development#egg=atom-ml\n

        Don't forget to include #egg=atom-ml to explicitly name the project. This way, pip can track metadata for it without having to run the setup.py script.

        Contributing

        If you are planning to contribute to the project, you'll need the development dependencies. Install them adding [dev] after the package's name.

        pip install -U atom-ml[dev]\n

        Click here for a complete list of package files for all versions published on PyPI.

        "}, {"location": "getting_started/#usage", "title": "Usage", "text": "

        ATOM contains a variety of classes and functions to perform data cleaning, feature engineering, model training, plotting and much more. The easiest way to use everything ATOM has to offer is through one of the main classes:

        Let's walk you through an example. Click on the SageMaker Studio Lab badge on top of this section to run this example yourself.

        Make the necessary imports and load the data.

        import pandas as pd\nfrom atom import ATOMClassifier\n\n# Load the Australian Weather dataset\nX = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)\nprint(X.head())\n

        Initialize the ATOMClassifier or ATOMRegressor class. These two classes are convenient wrappers for the whole machine learning pipeline. Contrary to sklearn's API, they are initialized providing the data you want to manipulate.

        import pandas as pd  # hide\nfrom atom import ATOMClassifier  # hide\nX = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)  # hide\n\natom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n

        Data transformations are applied through atom's methods. For example, calling the impute method will initialize an Imputer instance, fit it on the training set and transform the whole dataset. The transformations are applied immediately after calling the method (no fit and transform commands necessary).

        import pandas as pd  # hide\nfrom atom import ATOMClassifier  # hide\nX = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)  # hide\n\natom = ATOMClassifier(X, y=\"RainTomorrow\")  # hide\natom.verbose = 2  # hide\n\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \natom.encode(strategy=\"Target\", max_onehot=8)\n

        Similarly, models are trained and evaluated using the run method. Here, we fit both a LogisticRegression and LinearDiscriminantAnalysis model, and apply hyperparameter tuning.

        import pandas as pd  # hide\nfrom atom import ATOMClassifier  # hide\nX = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)  # hide\n\natom = ATOMClassifier(X, y=\"RainTomorrow\")  # hide\n\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  # hide \natom.encode(strategy=\"Target\", max_onehot=8)  # hide\natom.verbose = 2  # hide\n\natom.run(models=[\"LR\", \"LDA\"], metric=\"auc\", n_trials=6)\n

        And lastly, analyze the results.

        import pandas as pd  # hide\nfrom atom import ATOMClassifier  # hide\nX = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)  # hide\n\natom = ATOMClassifier(X, y=\"RainTomorrow\")  # hide\n\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  # hide \natom.encode(strategy=\"Target\", max_onehot=8)  # hide\n\natom.run(models=[\"LR\", \"LDA\"], metric=\"auc\", n_trials=6)  # hide\n\nprint(atom.evaluate())\n\natom.plot_lift()\n
        "}, {"location": "license/", "title": "MIT License", "text": "

        Copyright \u00a9 2023 Mavs

        Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

        THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

        "}, {"location": "API/ATOM/atomclassifier/", "title": "ATOMClassifier", "text": "

        class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine=None, backend=\"loky\", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for classification tasks.

        Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

        All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

        Parameters*arrays: sequence of indexables Dataset containing features and target. Allowed formats are:

        X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str or sequence Target column corresponding to X.

        y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to X.

        This parameter is ignored if the target column is provided through arrays.

        index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe.

        test_size: int or float, default=0.2

        This parameter is ignored if the test set is provided through arrays.

        holdout_size: int, float or None, default=None

        This parameter is ignored if the holdout set is provided through arrays.

        shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets.

        stratify: bool, int, str or sequence, default=True Handle stratification of the target classes over the data sets.

        This parameter is ignored if shuffle=False or if the test set is provided through arrays.

        For multioutput tasks, stratification is applied to the joint target columns.

        n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows.

        n_jobs: int, default=1 Number of cores to use for parallel processing.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        warnings: bool or str, default=False

        Changing this parameter affects the PYTHONWARNINGS environment. ATOM can't manage warnings that go from C/C++ code to stdout.

        logger: str, Logger or None, default=None

        experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

        See Also

        ATOMForecaster Main class for forecasting tasks.

        ATOMRegressor Main class for regression tasks.

        "}, {"location": "API/ATOM/atomclassifier/#example", "title": "Example", "text": "
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n# Initialize atom\natom = ATOMClassifier(X, y, verbose=2)\n\n# Apply data cleaning and feature engineering methods\natom.balance(strategy=\"smote\")\natom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\n# Train models\natom.run(models=[\"LR\", \"RF\", \"XGB\"])\n\n# Analyze the results\nprint(atom.results)\n\nprint(atom.evaluate())\n
        "}, {"location": "API/ATOM/atomclassifier/#magic-methods", "title": "Magic methods", "text": "

        The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

        "}, {"location": "API/ATOM/atomclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomclassifier/#data-attributes", "title": "Data attributes", "text": "

        The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

        Attributespipeline: pd.SeriesTransformers fitted on the data.

        Use this attribute only to access the individual instances. To visualize the pipeline, use the plot_pipeline method.mapping: dictEncoded values and their respective mapped values.

        The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, etc...).dataset: dataframeComplete data set.train: dataframeTraining set.test: dataframeTest set.X: dataframeFeature set.y: series | dataframeTarget column(s).X_train: dataframeFeatures of the training set.y_train: series | dataframeTarget column(s) of the training set.X_test: dataframeFeatures of the test set.y_test: series | dataframeTarget column(s) of the test set.shape: tuple[int, int]Shape of the dataset (n_rows, n_columns).columns: seriesName of all the columns.n_columns: intNumber of columns.features: seriesName of the features.n_features: intNumber of features.target: str | list[str]Name of the target column(s).scaled: boolWhether the feature set is scaled.

        A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only 0s and 1s) are excluded from the calculation.duplicates: seriesNumber of duplicate rows in the dataset.missing: listValues that are considered \"missing\".

        These values are used by the clean and impute methods. Default values are: None, NaN, NaT, +inf, -inf, \"\", \"?\", \"None\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"inf\". Note that None, NaN, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.nans: series | NoneColumns with the number of missing values in them.n_nans: int | NoneNumber of samples containing missing values.numerical: seriesNames of the numerical features in the dataset.n_numerical: intNumber of numerical features in the dataset.categorical: seriesNames of the categorical features in the dataset.n_categorical: intNumber of categorical features in the dataset.outliers: series | NoneColumns in training set with amount of outlier values.n_outliers: int | NoneNumber of samples in the training set containing outliers.classes: pd.DataFrame | NoneDistribution of target classes per data set.n_classes: int | series | NoneNumber of classes in the target column(s).

        "}, {"location": "API/ATOM/atomclassifier/#utility-attributes", "title": "Utility attributes", "text": "

        The utility attributes are used to access information about the models in the instance after training.

        Attributesbranch: BranchCurrent active branch.

        Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use __from__ to split the new branch from any other existing branch. Read more in the user guide.models: str | list[str] | NoneName of the model(s).metric: str | list[str] | NoneName of the metric(s).winners: list[model] | NoneModels ordered by performance.

        Performance is measured as the highest score on the model's score_bootstrap or score_test attributes, checked in that order. For multi-metric runs, only the main metric is compared. Ties are resolved looking at the lowest time_fit.winner: model | NoneBest performing model.

        Performance is measured as the highest score on the model's score_bootstrap or score_test attributes, checked in that order. For multi-metric runs, only the main metric is compared. Ties are resolved looking at the lowest time_fit.results: pd.DataFrameOverview of the training results.

        All durations are in seconds. Columns include:

        "}, {"location": "API/ATOM/atomclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "

        The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

        Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning.log_model: boolWhether to save the model's estimator after fitting.log_plots: boolWhether to save plots as artifacts.log_data: boolWhether to save the train and test sets.log_pipeline: boolWhether to save the model's pipeline.

        "}, {"location": "API/ATOM/atomclassifier/#plot-attributes", "title": "Plot attributes", "text": "

        The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

        Attributespalette: str | SEQUENCEColor palette.

        Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = [\"red\", \"green\", \"blue\"].title_fontsize: intFontsize for the plot's title.label_fontsize: intFontsize for the labels, legend and hover information.tick_fontsize: intFontsize for the ticks along the plot's axes.line_width: intWidth of the line plots.marker_size: intSize of the markers.

        "}, {"location": "API/ATOM/atomclassifier/#utility-methods", "title": "Utility methods", "text": "

        Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

        addAdd a transformer to the pipeline.applyApply a function to the dataset.automlSearch for an optimized pipeline in an automated fashion.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the pipeline to a sklearn-like object.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoads an atom instance from a pickle file.logPrint message and save to log file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConverts the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

        method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

        If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

        Warning

        Note

        If the transform method doesn't return a dataframe:

        Note

        If the transformer has an n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

        Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

        columns: int, str, slice, sequence or None, default=None Names, indices or dtypes of the columns in the dataset to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. Add ! in front of a name or dtype to exclude that column, e.g. atom.add(Transformer(), columns=\"!Location\") transforms all columns except Location. You can either include or exclude columns, not combinations of these.

        train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

        **fit_params Additional keyword arguments for the transformer's fit method.

        method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

        The function should have signature func(dataset, **kw_args) -> dataset. This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

        Note

        This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

        Tip

        Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

        Parametersfunc: callable Function to apply.

        inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

        kw_args: dict or None, default=None Additional keyword arguments for the function.

        inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

        method automl(**kwargs)[source]Search for an optimized pipeline in an automated fashion.

        Automated machine learning (AutoML) automates the selection, composition and parameterization of machine learning pipelines. Automating the machine learning often provides faster, more accurate outputs than hand-coded algorithms. ATOM uses the evalML package for AutoML optimization. The resulting transformers and final estimator are merged with atom's pipeline (check the pipeline and models attributes after the method finishes running). The created AutoMLSearch instance can be accessed through the evalml attribute.

        Warning

        AutoML algorithms aren't intended to run for only a few minutes. The method may need a very long time to achieve optimal results.

        Parameters**kwargs Additional keyword arguments for the AutoMLSearch instance.

        method available_models()[source]Give an overview of the available predefined models.

        Returnspd.DataFrame Information about the available predefined models. Columns include:

        method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

        This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

        Parametersrows: int, default=1 Number of plots in length.

        cols: int, default=2 Number of plots in width.

        horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size.

        vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

        title: str, dict or None, default=None Title for the plot.

        legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

        figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

        filename: str or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

        display: bool, default=True Whether to render the plot.

        Yieldsgo.Figure Plot object.

        method clear()[source]Reset attributes and clear cache from all models.

        Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

        method delete(models=None)[source]Delete models.

        If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

        Parametersmodels: int, str, slice, Model, sequence or None, default=None Models to delete. If None, all models are deleted.

        method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

        Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

        Tip

        Use the plot_distribution method to plot a column's distribution.

        Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

        columns: int, str, slice, sequence or None, default=None Names, positions or dtypes of the columns in the dataset to perform the test on. If None, select all numerical columns.

        Returnspd.DataFrame Statistic results with multiindex levels:

        method eda(dataset=\"dataset\", n_rows=None, filename=None, **kwargs)[source]Create an Exploratory Data Analysis report.

        ATOM uses the ydata-profiling package for the EDA. The report is rendered directly in the notebook. The created ProfileReport instance can be accessed through the report attribute.

        Warning

        This method can be slow for large datasets.

        Parametersdataset: str, default=\"dataset\" Data set to get the report from.

        n_rows: int or None, default=None Number of (randomly picked) rows to process. None to use all rows.

        filename: str or None, default=None Name to save the file with (as .html). None to not save anything.

        **kwargs Additional keyword arguments for the ProfileReport instance.

        method evaluate(metric=None, dataset=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

        Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

        dataset: str, default=\"test\" Data set on which to calculate the metric. Choose from: \"train\", \"test\" or \"holdout\".

        threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

        For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

        sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

        Returnspd.DataFrame Scores of the models.

        method export_pipeline(model=None, memory=None, verbose=None)[source]Export the pipeline to a sklearn-like object.

        Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

        Info

        The returned pipeline behaves similarly to sklearn's Pipeline, and additionally:

        Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

        memory: bool, str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. - If None or False: No caching is performed. - If True: A default temp directory is used. - If str: Path to the caching directory. - If Memory: Object with the joblib.Memory interface.

        verbose: int or None, default=None Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. Note that this is not the pipeline's own verbose parameter. To change that, use the set_params method.

        ReturnsPipeline Current branch as a sklearn-like Pipeline object.

        method get_class_weight(dataset=\"train\")[source]Return class weights for a balanced data set.

        Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected data set.

        Parametersdataset: str, default=\"train\" Data set from which to get the weights. Choose from: \"train\", \"test\", \"dataset\".

        Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

        method get_sample_weight(dataset=\"train\")[source]Return sample weights for a balanced data set.

        The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

        Parametersdataset: str, default=\"train\" Data set from which to get the weights. Choose from: \"train\", \"test\", \"dataset\".

        Returnsseries Sequence of weights with shape=(n_samples,).

        method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

        Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

        ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

        y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X.

        verbose: int or None, default=None Verbosity level for the transformers. If None, it uses the transformer's own verbosity.

        Returnsdataframe Original feature set. Only returned if provided.

        series Original target column. Only returned if provided.

        function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]Loads an atom instance from a pickle file.

        If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

        Note

        The loaded instance's current branch is the same branch as it was when saved.

        Parametersfilename: str Name of the pickle file.

        data: sequence of indexables or None, default=None Original dataset. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

        X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str or sequence Target column corresponding to X.

        transform_data: bool, default=True If False, the data is left as provided. If True, it's transformed through all the steps in the loaded instance's pipeline.

        verbose: int or None, default=None Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if transform_data=False.

        Returnsatom instance Unpickled atom instance.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

        Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

        Parametersother: Runner Instance with which to merge. Should be of the same class as self.

        suffix: str, default=\"2\" Conflicting branches and models are merged adding suffix to the end of their names.

        method update_layout(**kwargs)[source]Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_layout method.

        method update_traces(**kwargs)[source]Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_traces method.

        method reset()[source]Reset the instance to its initial state.

        Deletes all branches and models. The dataset is also reset to its form after initialization.

        method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method save_data(filename=\"auto\", dataset=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        dataset: str, default=\"dataset\" Data set to save.

        **kwargs Additional keyword arguments for pandas' to_csv method.

        method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Converts the columns to the smallest possible matching dtype.

        Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

        int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

        str2cat: bool, default=False Whether to convert string to category. Only if the number of categories would be less than 30% of the length of the column.

        dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

        columns: int, str, slice, sequence or None, default=None Names, positions or dtypes of the columns in the dataset to shrink. If None, transform all columns.

        method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

        Warning

        Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

        Parametersmodels: slice, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

        name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

        **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

        method stats(_vb=-2)[source]Display basic information about the dataset.

        Parameters_vb: int, default=-2 Internal parameter to always print if called by user.

        method status()[source]Get an overview of the branches and models.

        This method prints the same information as the __repr__ and also saves it to the logger.

        method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

        Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

        y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X.

        verbose: int or None, default=None Verbosity level for the transformers. If None, it uses the transformer's own verbosity.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

        Warning

        Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

        Parametersmodels: slice, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch.

        name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

        **kwargs Additional keyword arguments for sklearn's voting instance.

        "}, {"location": "API/ATOM/atomclassifier/#data-cleaning", "title": "Data cleaning", "text": "

        The data cleaning methods can help you scale the data, handle missing values, categorical columns, outliers and unbalanced datasets. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

        Tip

        Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

        balanceBalance the number of rows per class in the target column.cleanApplies standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

        method balance(strategy=\"adasyn\", **kwargs)[source]Balance the number of rows per class in the target column.

        When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set.

        See the Balancer class for a description of the parameters.

        Note

        Tip

        Use atom's classes attribute for an overview of the target class distribution per data set.

        method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Applies standard data cleaning steps on the dataset.

        Use the parameters to choose which transformations to perform. The available steps are:

        See the Cleaner class for a description of the parameters.

        method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

        For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

        See the Discretizer class for a description of the parameters.

        Tip

        Use the plot_distribution method to visualize a column's distribution and decide on the bins.

        method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

        The encoding type depends on the number of classes in the column:

        Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

        See the Encoder class for a description of the parameters.

        Note

        This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

        Tip

        Use the categorical attribute for a list of the categorical features in the dataset.

        method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

        Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

        See the Imputer class for a description of the parameters.

        Tip

        Use the nans attribute to check the amount of missing values per column.

        method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

        This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

        See the Normalizer class for a description of the parameters.

        Tip

        Use the plot_distribution method to examine a column's distribution.

        method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

        Replace or remove outliers. The definition of an outlier depends on the selected strategy, and the definitions can differ greatly from one strategy to another. Ignores categorical columns.

        See the Pruner class for a description of the parameters.

        Note

        This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

        Tip

        Use the outliers attribute to check the number of outliers per column.

        method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

        Apply one of sklearn's scalers. Categorical columns are ignored.

        See the Scaler class for a description of the parameters.

        Tip

        Use the scaled attribute to check whether the dataset is scaled.

        "}, {"location": "API/ATOM/atomclassifier/#nlp", "title": "NLP", "text": "

        The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

        textcleanApplies standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

        method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Applies standard text cleaning to the corpus.

        Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

        See the TextCleaner class for a description of the parameters.

        method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

        Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

        See the TextNormalizer class for a description of the parameters.

        method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

        Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g. \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

        See the Tokenizer class for a description of the parameters.

        method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

        Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

        If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

        See the Vectorizer class for a description of the parameters.

        "}, {"location": "API/ATOM/atomclassifier/#feature-engineering", "title": "Feature engineering", "text": "

        To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

        feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

        method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

        Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

        See the FeatureExtractor class for a description of the parameters.

        method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

        Create new combinations of existing features to capture the non-linear relations between the original features.

        See the FeatureGenerator class for a description of the parameters.

        method feature_grouping(group, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

        Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

        See the FeatureGrouper class for a description of the parameters.

        method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

        Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low variance features.

        See the FeatureSelector class for a description of the parameters.

        Note

        "}, {"location": "API/ATOM/atomclassifier/#training", "title": "Training", "text": "

        The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

        runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

        method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

        Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

        The following steps are applied to every model:

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the DirectClassifier or DirectRegressor class for a description of the parameters.

        method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

        The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g. only using tree-based models.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

        method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

        When training models, there is usually a trade-off between model performance and computation time, that is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off, and help determine the optimal size of the training set. The models are fitted multiple times, each time increasing the number of samples in the training set.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

        "}, {"location": "API/ATOM/atomforecaster/", "title": "ATOMForecaster", "text": "

        class atom.api.ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine=None, backend=\"loky\", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for forecasting tasks.

        Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

        All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

        Parameters*arrays: sequence of indexables Dataset containing exogenous features and time series. Allowed formats are:

        X, train, test: dataframe-like Exogenous feature set corresponding to y, with shape=(n_samples, n_features).

        y: int, str or sequence Time series.

        y: int, str, dict, sequence or dataframe, default=-1 Time series.

        This parameter is ignored if the time series is provided through arrays.

        test_size: int or float, default=0.2

        This parameter is ignored if the test set is provided through arrays.

        holdout_size: int, float or None, default=None

        This parameter is ignored if the holdout set is provided through arrays.

        n_rows: int or float, default=1 Subsample of the dataset to use. The cut is made from the head of the dataset (older entries are dropped when sorted by date ascending). The default value selects all rows.

        n_jobs: int, default=1 Number of cores to use for parallel processing.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        warnings: bool or str, default=False

        Changing this parameter affects the PYTHONWARNINGS environment. ATOM can't manage warnings that go from C/C++ code to stdout.

        logger: str, Logger or None, default=None

        experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

        See Also

        ATOMClassifier Main class for classification tasks.

        ATOMRegressor Main class for regression tasks.

        "}, {"location": "API/ATOM/atomforecaster/#example", "title": "Example", "text": "
        from atom import ATOMForecaster\nfrom sktime.datasets import load_airline\n\ny = load_airline()\n\n# Initialize atom\natom = ATOMForecaster(y, verbose=2)\n\n# Train models\natom.run(models=[\"NF\", \"ES\", \"ETS\"])\n\n# Analyze the results\nprint(atom.results)\n\nprint(atom.evaluate())\n
        "}, {"location": "API/ATOM/atomforecaster/#magic-methods", "title": "Magic methods", "text": "

        The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

        "}, {"location": "API/ATOM/atomforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomforecaster/#data-attributes", "title": "Data attributes", "text": "

        The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

        Attributespipeline: pd.SeriesTransformers fitted on the data.

        Use this attribute only to access the individual instances. To visualize the pipeline, use the plot_pipeline method.mapping: dictEncoded values and their respective mapped values.

        The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, etc...).dataset: dataframeComplete data set.train: dataframeTraining set.test: dataframeTest set.X: dataframeFeature set.y: series | dataframeTarget column(s).X_train: dataframeFeatures of the training set.y_train: series | dataframeTarget column(s) of the training set.X_test: dataframeFeatures of the test set.y_test: series | dataframeTarget column(s) of the test set.shape: tuple[int, int]Shape of the dataset (n_rows, n_columns).columns: seriesName of all the columns.n_columns: intNumber of columns.features: seriesName of the features.n_features: intNumber of features.target: str | list[str]Name of the target column(s).scaled: boolWhether the feature set is scaled.

        A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only 0s and 1s) are excluded from the calculation.duplicates: seriesNumber of duplicate rows in the dataset.missing: listValues that are considered \"missing\".

        These values are used by the clean and impute methods. Default values are: None, NaN, NaT, +inf, -inf, \"\", \"?\", \"None\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"inf\". Note that None, NaN, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.nans: series | NoneColumns with the number of missing values in them.n_nans: int | NoneNumber of samples containing missing values.numerical: seriesNames of the numerical features in the dataset.n_numerical: intNumber of numerical features in the dataset.categorical: seriesNames of the categorical features in the dataset.n_categorical: intNumber of categorical features in the dataset.outliers: series | NoneColumns in training set with amount of outlier values.n_outliers: int | NoneNumber of samples in the training set containing outliers.

        "}, {"location": "API/ATOM/atomforecaster/#utility-attributes", "title": "Utility attributes", "text": "

        The utility attributes are used to access information about the models in the instance after training.

        Attributesbranch: BranchCurrent active branch.

        Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use __from__ to split the new branch from any other existing branch. Read more in the user guide.models: str | list[str] | NoneName of the model(s).metric: str | list[str] | NoneName of the metric(s).winners: list[model] | NoneModels ordered by performance.

        Performance is measured as the highest score on the model's score_bootstrap or score_test attributes, checked in that order. For multi-metric runs, only the main metric is compared. Ties are resolved looking at the lowest time_fit.winner: model | NoneBest performing model.

        Performance is measured as the highest score on the model's score_bootstrap or score_test attributes, checked in that order. For multi-metric runs, only the main metric is compared. Ties are resolved looking at the lowest time_fit.results: pd.DataFrameOverview of the training results.

        All durations are in seconds. Columns include:

        "}, {"location": "API/ATOM/atomforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "

        The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

        Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning.log_model: boolWhether to save the model's estimator after fitting.log_plots: boolWhether to save plots as artifacts.log_data: boolWhether to save the train and test sets.log_pipeline: boolWhether to save the model's pipeline.

        "}, {"location": "API/ATOM/atomforecaster/#plot-attributes", "title": "Plot attributes", "text": "

        The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

        Attributespalette: str | SEQUENCEColor palette.

        Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = [\"red\", \"green\", \"blue\"].title_fontsize: intFontsize for the plot's title.label_fontsize: intFontsize for the labels, legend and hover information.tick_fontsize: intFontsize for the ticks along the plot's axes.line_width: intWidth of the line plots.marker_size: intSize of the markers.

        "}, {"location": "API/ATOM/atomforecaster/#utility-methods", "title": "Utility methods", "text": "

        Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

        addAdd a transformer to the pipeline.applyApply a function to the dataset.automlSearch for an optimized pipeline in an automated fashion.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the pipeline to a sklearn-like object.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoads an atom instance from a pickle file.logPrint message and save to log file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConverts the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

        method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

        If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

        Warning

        Note

        If the transform method doesn't return a dataframe:

        Note

        If the transformer has an n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

        Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

        columns: int, str, slice, sequence or None, default=None Names, indices or dtypes of the columns in the dataset to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. Add ! in front of a name or dtype to exclude that column, e.g. atom.add(Transformer(), columns=\"!Location\") transforms all columns except Location. You can either include or exclude columns, not combinations of these.

        train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

        **fit_params Additional keyword arguments for the transformer's fit method.

        method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

        The function should have signature func(dataset, **kw_args) -> dataset. This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

        Note

        This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

        Tip

        Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

        Parametersfunc: callable Function to apply.

        inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

        kw_args: dict or None, default=None Additional keyword arguments for the function.

        inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

        method automl(**kwargs)[source]Search for an optimized pipeline in an automated fashion.

        Automated machine learning (AutoML) automates the selection, composition and parameterization of machine learning pipelines. Automating machine learning often provides faster, more accurate outputs than hand-coded algorithms. ATOM uses the evalML package for AutoML optimization. The resulting transformers and final estimator are merged with atom's pipeline (check the pipeline and models attributes after the method finishes running). The created AutoMLSearch instance can be accessed through the evalml attribute.

        Warning

        AutoML algorithms aren't intended to run for only a few minutes. The method may need a very long time to achieve optimal results.

        Parameters**kwargs Additional keyword arguments for the AutoMLSearch instance.

        method available_models()[source]Give an overview of the available predefined models.

        Returnspd.DataFrame Information about the available predefined models. Columns include:

        method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

        This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

        Parametersrows: int, default=1 Number of plots in length.

        cols: int, default=2 Number of plots in width.

        horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

        vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

        title: str, dict or None, default=None Title for the plot.

        legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

        figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

        filename: str or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

        display: bool, default=True Whether to render the plot.

        Yieldsgo.Figure Plot object.

        method clear()[source]Reset attributes and clear cache from all models.

        Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

        method delete(models=None)[source]Delete models.

        If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

        Parametersmodels: int, str, slice, Model, sequence or None, default=None Models to delete. If None, all models are deleted.

        method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

        Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

        Tip

        Use the plot_distribution method to plot a column's distribution.

        Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

        columns: int, str, slice, sequence or None, default=None Names, positions or dtypes of the columns in the dataset to perform the test on. If None, select all numerical columns.

        Returnspd.DataFrame Statistic results with multiindex levels:

        method eda(dataset=\"dataset\", n_rows=None, filename=None, **kwargs)[source]Create an Exploratory Data Analysis report.

        ATOM uses the ydata-profiling package for the EDA. The report is rendered directly in the notebook. The created ProfileReport instance can be accessed through the report attribute.

        Warning

        This method can be slow for large datasets.

        Parametersdataset: str, default=\"dataset\" Data set to get the report from.

        n_rows: int or None, default=None Number of (randomly picked) rows to process. None to use all rows.

        filename: str or None, default=None Name to save the file with (as .html). None to not save anything.

        **kwargs Additional keyword arguments for the ProfileReport instance.

        method evaluate(metric=None, dataset=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

        Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

        dataset: str, default=\"test\" Data set on which to calculate the metric. Choose from: \"train\", \"test\" or \"holdout\".

        threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

        For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

        sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

        Returnspd.DataFrame Scores of the models.

        method export_pipeline(model=None, memory=None, verbose=None)[source]Export the pipeline to a sklearn-like object.

        Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

        Info

        The returned pipeline behaves similarly to sklearn's Pipeline, and additionally:

        Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

        memory: bool, str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. - If None or False: No caching is performed. - If True: A default temp directory is used. - If str: Path to the caching directory. - If Memory: Object with the joblib.Memory interface.

        verbose: int or None, default=None Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. Note that this is not the pipeline's own verbose parameter. To change that, use the set_params method.

        ReturnsPipeline Current branch as a sklearn-like Pipeline object.

        method get_class_weight(dataset=\"train\")[source]Return class weights for a balanced data set.

        Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected data set.

        Parametersdataset: str, default=\"train\" Data set from which to get the weights. Choose from: \"train\", \"test\", \"dataset\".

        Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

        method get_sample_weight(dataset=\"train\")[source]Return sample weights for a balanced data set.

        The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

        Parametersdataset: str, default=\"train\" Data set from which to get the weights. Choose from: \"train\", \"test\", \"dataset\".

        Returnsseries Sequence of weights with shape=(n_samples,).

        method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

        Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

        ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

        y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X.

        verbose: int or None, default=None Verbosity level for the transformers. If None, it uses the transformer's own verbosity.

        Returnsdataframe Original feature set. Only returned if provided.

        series Original target column. Only returned if provided.

        function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]Loads an atom instance from a pickle file.

        If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

        Note

        The loaded instance's current branch is the same branch as it was when saved.

        Parametersfilename: str Name of the pickle file.

        data: sequence of indexables or None, default=None Original dataset. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

        X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str or sequence Target column corresponding to X.

        transform_data: bool, default=True If False, the data is left as provided. If True, it's transformed through all the steps in the loaded instance's pipeline.

        verbose: int or None, default=None Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if transform_data=False.

        Returnsatom instance Unpickled atom instance.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

        Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

        Parametersother: Runner Instance with which to merge. Should be of the same class as self.

        suffix: str, default=\"2\" Conflicting branches and models are merged adding suffix to the end of their names.

        method update_layout(**kwargs)[source]Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_layout method.

        method update_traces(**kwargs)[source]Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_traces method.

        method reset()[source]Reset the instance to its initial state.

        Deletes all branches and models. The dataset is also reset to its form after initialization.

        method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method save_data(filename=\"auto\", dataset=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        dataset: str, default=\"dataset\" Data set to save.

        **kwargs Additional keyword arguments for pandas' to_csv method.

        method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Converts the columns to the smallest possible matching dtype.

        Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

        int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

        str2cat: bool, default=False Whether to convert string to category. Only if the number of categories would be less than 30% of the length of the column.

        dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

        columns: int, str, slice, sequence or None, default=None Names, positions or dtypes of the columns in the dataset to shrink. If None, transform all columns.

        method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

        Warning

        Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

        Parametersmodels: slice, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

        name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

        **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

        method stats(_vb=-2)[source]Display basic information about the dataset.

        Parameters_vb: int, default=-2 Internal parameter to always print if called by user.

        method status()[source]Get an overview of the branches and models.

        This method prints the same information as the __repr__ and also saves it to the logger.

        method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

        Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

        y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X.

        verbose: int or None, default=None Verbosity level for the transformers. If None, it uses the transformer's own verbosity.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

        Warning

        Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

        Parametersmodels: slice, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch.

        name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

        **kwargs Additional keyword arguments for sklearn's voting instance.

        "}, {"location": "API/ATOM/atomforecaster/#data-cleaning", "title": "Data cleaning", "text": "

        The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

        Tip

        Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

        cleanApplies standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

        method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Applies standard data cleaning steps on the dataset.

        Use the parameters to choose which transformations to perform. The available steps are:

        See the Cleaner class for a description of the parameters.

        method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

        For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

        See the Discretizer class for a description of the parameters.

        Tip

        Use the plot_distribution method to visualize a column's distribution and decide on the bins.

        method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

        The encoding type depends on the number of classes in the column:

        Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

        See the Encoder class for a description of the parameters.

        Note

        This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

        Tip

        Use the categorical attribute for a list of the categorical features in the dataset.

        method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

        Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

        See the Imputer class for a description of the parameters.

        Tip

        Use the nans attribute to check the amount of missing values per column.

        method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

        This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

        See the Normalizer class for a description of the parameters.

        Tip

        Use the plot_distribution method to examine a column's distribution.

        method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

        Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

        See the Pruner class for a description of the parameters.

        Note

        This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

        Tip

        Use the outliers attribute to check the number of outliers per column.

        method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

        Apply one of sklearn's scalers. Categorical columns are ignored.

        See the Scaler class for a description of the parameters.

        Tip

        Use the scaled attribute to check whether the dataset is scaled.

        "}, {"location": "API/ATOM/atomforecaster/#nlp", "title": "NLP", "text": "

        The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

        textcleanApplies standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

        method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Applies standard text cleaning to the corpus.

        Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

        See the TextCleaner class for a description of the parameters.

        method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

        Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

        See the TextNormalizer class for a description of the parameters.

        method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

        Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g. \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

        See the Tokenizer class for a description of the parameters.

        method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

        Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

        If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

        See the Vectorizer class for a description of the parameters.

        "}, {"location": "API/ATOM/atomforecaster/#feature-engineering", "title": "Feature engineering", "text": "

        To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

        feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

        method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

        Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

        See the FeatureExtractor class for a description of the parameters.

        method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

        Create new combinations of existing features to capture the non-linear relations between the original features.

        See the FeatureGenerator class for a description of the parameters.

        method feature_grouping(group, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

        Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

        See the FeatureGrouper class for a description of the parameters.

        method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

        Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low variance features.

        See the FeatureSelector class for a description of the parameters.

        Note

        "}, {"location": "API/ATOM/atomforecaster/#training", "title": "Training", "text": "

        The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

        runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

        method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

        Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

        The following steps are applied to every model:

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the DirectClassifier or DirectRegressor class for a description of the parameters.

        method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

        The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g. only using tree-based models.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

        method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

        When training models, there is usually a trade-off between model performance and computation time, that is regulated by the number of samples in the training set. This method can be used to create insights in this trade-off, and help determine the optimal size of the training set. The models are fitted multiple times, ever-increasing the number of samples in the training set.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

        "}, {"location": "API/ATOM/atommodel/", "title": "ATOMModel", "text": "

        function atom.api.ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)[source]Convert an estimator to a model that can be ingested by atom.

        This function adds the relevant attributes to the estimator so that they can be used by atom. Note that only estimators that follow sklearn's API are compatible.

        Read more about using custom models in the user guide.

        Parametersestimator: Predictor Custom estimator. Should implement a fit and predict method.

        name: str or None, default=None Name for the model. This is the value used to call the model from atom. The value should start with the model's acronym when specified. If None, the capital letters of the estimator's name are used (only if two or more, else it uses the entire name).

        acronym: str or None, default=None Model's acronym. If None, it uses the model's name. Specify this parameter when you want to train multiple custom models that share the same estimator.

        needs_scaling: bool, default=False Whether the model should use automated feature scaling.

        native_multilabel: bool, default=False Whether the model has native support for multilabel tasks. If False and the task is multilabel, a multilabel meta-estimator is wrapped around the estimator.

        native_multioutput: bool, default=False Whether the model has native support for multioutput tasks. If False and the task is multioutput, a multioutput meta-estimator is wrapped around the estimator.

        has_validation: str or None, default=None Whether the model allows in-training validation. If str, name of the estimator's parameter that states the number of iterations. If None, no support for in-training validation.

        Returnsestimator Clone of the provided estimator with custom attributes.

        "}, {"location": "API/ATOM/atommodel/#example", "title": "Example", "text": "
        from atom import ATOMRegressor, ATOMModel\nfrom sklearn.datasets import load_diabetes\nfrom sklearn.linear_model import RANSACRegressor\n\nransac = ATOMModel(\n    estimator=RANSACRegressor(),\n    name=\"RANSAC\",\n    needs_scaling=False,\n)\n\nX, y = load_diabetes(return_X_y=True, as_frame=True)\n\natom = ATOMRegressor(X, y, verbose=2)\natom.run(ransac)\n
        "}, {"location": "API/ATOM/atomregressor/", "title": "ATOMRegressor", "text": "

        class atom.api.ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine=None, backend=\"loky\", verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for regression tasks.

        Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.

        All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.

        Parameters*arrays: sequence of indexables Dataset containing features and target. Allowed formats are:

        X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str or sequence Target column corresponding to X.

        y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to X.

        This parameter is ignored if the target column is provided through arrays.

        index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe.

        test_size: int or float, default=0.2

        This parameter is ignored if the test set is provided through arrays.

        holdout_size: int, float or None, default=None

        This parameter is ignored if the holdout set is provided through arrays.

        shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets.

        n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows.

        n_jobs: int, default=1 Number of cores to use for parallel processing.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        warnings: bool or str, default=False

        Changing this parameter affects the PYTHONWARNINGS environment. ATOM can't manage warnings that go from C/C++ code to stdout.

        logger: str, Logger or None, default=None

        experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed.

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

        See Also

        ATOMClassifier Main class for classification tasks.

        ATOMForecaster Main class for forecasting tasks.

        "}, {"location": "API/ATOM/atomregressor/#example", "title": "Example", "text": "
        from atom import ATOMRegressor\nfrom sklearn.datasets import load_diabetes\n\nX, y = load_diabetes(return_X_y=True, as_frame=True)\n\n# Initialize atom\natom = ATOMRegressor(X, y, verbose=2)\n\n# Apply data cleaning and feature engineering methods\natom.scale()\natom.feature_selection(strategy=\"rfecv\", solver=\"xgb\", n_features=12)\n\n# Train models\natom.run(models=[\"OLS\", \"RF\", \"XGB\"])\n\n# Analyze the results\nprint(atom.results)\n\nprint(atom.evaluate())\n
        "}, {"location": "API/ATOM/atomregressor/#magic-methods", "title": "Magic methods", "text": "

        The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.

        "}, {"location": "API/ATOM/atomregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomregressor/#data-attributes", "title": "Data attributes", "text": "

        The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.

        Attributespipeline: pd.SeriesTransformers fitted on the data.

        Use this attribute only to access the individual instances. To visualize the pipeline, use the plot_pipeline method.mapping: dictEncoded values and their respective mapped values.

        The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, etc...).dataset: dataframeComplete data set.train: dataframeTraining set.test: dataframeTest set.X: dataframeFeature set.y: series | dataframeTarget column(s).X_train: dataframeFeatures of the training set.y_train: series | dataframeTarget column(s) of the training set.X_test: dataframeFeatures of the test set.y_test: series | dataframeTarget column(s) of the test set.shape: tuple[int, int]Shape of the dataset (n_rows, n_columns).columns: seriesName of all the columns.n_columns: intNumber of columns.features: seriesName of the features.n_features: intNumber of features.target: str | list[str]Name of the target column(s).scaled: boolWhether the feature set is scaled.

        A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only 0s and 1s) are excluded from the calculation.duplicates: seriesNumber of duplicate rows in the dataset.missing: listValues that are considered \"missing\".

        These values are used by the clean and impute methods. Default values are: None, NaN, NaT, +inf, -inf, \"\", \"?\", \"None\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"inf\". Note that None, NaN, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.nans: series | NoneColumns with the number of missing values in them.n_nans: int | NoneNumber of samples containing missing values.numerical: seriesNames of the numerical features in the dataset.n_numerical: intNumber of numerical features in the dataset.categorical: seriesNames of the categorical features in the dataset.n_categorical: intNumber of categorical features in the dataset.outliers: series | NoneColumns in training set with amount of outlier values.n_outliers: int | NoneNumber of samples in the training set containing outliers.

        "}, {"location": "API/ATOM/atomregressor/#utility-attributes", "title": "Utility attributes", "text": "

        The utility attributes are used to access information about the models in the instance after training.

        Attributesbranch: BranchCurrent active branch.

        Use the property's @setter to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use __from__ to split the new branch from any other existing branch. Read more in the user guide.models: str | list[str] | NoneName of the model(s).metric: str | list[str] | NoneName of the metric(s).winners: list[model] | NoneModels ordered by performance.

        Performance is measured as the highest score on the model's score_bootstrap or score_test attributes, checked in that order. For multi-metric runs, only the main metric is compared. Ties are resolved looking at the lowest time_fit.winner: model | NoneBest performing model.

        Performance is measured as the highest score on the model's score_bootstrap or score_test attributes, checked in that order. For multi-metric runs, only the main metric is compared. Ties are resolved looking at the lowest time_fit.results: pd.DataFrameOverview of the training results.

        All durations are in seconds. Columns include:

        "}, {"location": "API/ATOM/atomregressor/#tracking-attributes", "title": "Tracking attributes", "text": "

        The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.

        Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning.log_model: boolWhether to save the model's estimator after fitting.log_plots: boolWhether to save plots as artifacts.log_data: boolWhether to save the train and test sets.log_pipeline: boolWhether to save the model's pipeline.

        "}, {"location": "API/ATOM/atomregressor/#plot-attributes", "title": "Plot attributes", "text": "

        The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.

        Attributespalette: str | SEQUENCEColor palette.

        Specify one of plotly's built-in palettes or create a custom one, e.g. atom.palette = [\"red\", \"green\", \"blue\"].title_fontsize: intFontsize for the plot's title.label_fontsize: intFontsize for the labels, legend and hover information.tick_fontsize: intFontsize for the ticks along the plot's axes.line_width: intWidth of the line plots.marker_size: intSize of the markers.

        "}, {"location": "API/ATOM/atomregressor/#utility-methods", "title": "Utility methods", "text": "

        Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.

        addAdd a transformer to the pipeline.applyApply a function to the dataset.automlSearch for an optimized pipeline in an automated fashion.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the pipeline to a sklearn-like object.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoads an atom instance from a pickle file.logPrint message and save to log file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a .csv file.shrinkConverts the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.

        method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.

        If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.

        Warning

        Note

        If the transform method doesn't return a dataframe:

        Note

        If the transformer has a n_jobs and/or random_state parameter that is left to its default value, it adopts atom's value.

        Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a transform method.

        columns: int, str, slice, sequence or None, default=None Names, indices or dtypes of the columns in the dataset to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. Add ! in front of a name or dtype to exclude that column, e.g. atom.add(Transformer(), columns=\"!Location\") transforms all columns except Location. You can either include or exclude columns, not combinations of these.

        train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. Note that if True, the transformation is skipped when making predictions on new data.

        **fit_params Additional keyword arguments for the transformer's fit method.

        method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.

        The function should have signature func(dataset, **kw_args) -> dataset. This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...

        Note

        This approach is preferred over changing the dataset directly through the property's @setter since the transformation is stored in the pipeline.

        Tip

        Use atom.apply(lambda df: df.drop(\"column_name\",axis=1)) to store the removal of columns in the pipeline.

        Parametersfunc: callable Function to apply.

        inverse_func: callable or None, default=None Inverse function of func. If None, the inverse_transform method returns the input unchanged.

        kw_args: dict or None, default=None Additional keyword arguments for the function.

        inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function.

        method automl(**kwargs)[source]Search for an optimized pipeline in an automated fashion.

        Automated machine learning (AutoML) automates the selection, composition and parameterization of machine learning pipelines. Automating machine learning often provides faster, more accurate outputs than hand-coded algorithms. ATOM uses the evalML package for AutoML optimization. The resulting transformers and final estimator are merged with atom's pipeline (check the pipeline and models attributes after the method finishes running). The created AutoMLSearch instance can be accessed through the evalml attribute.

        Warning

        AutoML algorithms aren't intended to run for only a few minutes. The method may need a very long time to achieve optimal results.

        Parameters**kwargs Additional keyword arguments for the AutoMLSearch instance.

        method available_models()[source]Give an overview of the available predefined models.

        Returnspd.DataFrame Information about the available predefined models. Columns include:

        method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.

        This @contextmanager allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.

        Parametersrows: int, default=1 Number of plots in length.

        cols: int, default=2 Number of plots in width.

        horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size.

        vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size.

        title: str, dict or None, default=None Title for the plot.

        legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices.

        figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas.

        filename: str or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

        display: bool, default=True Whether to render the plot.

        Yieldsgo.Figure Plot object.

        method clear()[source]Reset attributes and clear cache from all models.

        Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:

        method delete(models=None)[source]Delete models.

        If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.

        Parametersmodels: int, str, slice, Model, sequence or None, default=None Models to delete. If None, all models are deleted.

        method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.

        Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.

        Tip

        Use the plot_distribution method to plot a column's distribution.

        Parametersdistributions: str, sequence or None, default=None Names of the distributions in scipy.stats to get the statistics on. If None, a selection of the most common ones is used.

        columns: int, str, slice, sequence or None, default=None Names, positions or dtypes of the columns in the dataset to perform the test on. If None, select all numerical columns.

        Returnspd.DataFrame Statistic results with multiindex levels:

        method eda(dataset=\"dataset\", n_rows=None, filename=None, **kwargs)[source]Create an Exploratory Data Analysis report.

        ATOM uses the ydata-profiling package for the EDA. The report is rendered directly in the notebook. The created ProfileReport instance can be accessed through the report attribute.

        Warning

        This method can be slow for large datasets.

        Parametersdataset: str, default=\"dataset\" Data set to get the report from.

        n_rows: int or None, default=None Number of (randomly picked) rows to process. None to use all rows.

        filename: str or None, default=None Name to save the file with (as .html). None to not save anything.

        **kwargs Additional keyword arguments for the ProfileReport instance.

        method evaluate(metric=None, dataset=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.

        Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task.

        dataset: str, default=\"test\" Data set on which to calculate the metric. Choose from: \"train\", \"test\" or \"holdout\".

        threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when:

        For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.

        sample_weight: sequence or None, default=None Sample weights corresponding to y in dataset.

        Returnspd.DataFrame Scores of the models.

        method export_pipeline(model=None, memory=None, verbose=None)[source]Export the pipeline to a sklearn-like object.

        Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.

        Info

        The returned pipeline behaves similarly to sklearn's Pipeline, and additionally:

        Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported.

        memory: bool, str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. - If None or False: No caching is performed. - If True: A default temp directory is used. - If str: Path to the caching directory. - If Memory: Object with the joblib.Memory interface.

        verbose: int or None, default=None Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. Note that this is not the pipeline's own verbose parameter. To change that, use the set_params method.

        ReturnsPipeline Current branch as a sklearn-like Pipeline object.

        method get_class_weight(dataset=\"train\")[source]Return class weights for a balanced data set.

        Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected data set.

        Parametersdataset: str, default=\"train\" Data set from which to get the weights. Choose from: \"train\", \"test\", \"dataset\".

        Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks.

        method get_sample_weight(dataset=\"train\")[source]Return sample weights for a balanced data set.

        The returned weights are inversely proportional to the class frequencies in the selected data set. For multioutput tasks, the weights of each column of y will be multiplied.

        Parametersdataset: str, default=\"train\" Data set from which to get the weights. Choose from: \"train\", \"test\", \"dataset\".

        Returnsseries Sequence of weights with shape=(n_samples,).

        method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.

        Transformers that are only applied on the training set are skipped. The rest should all implement an inverse_transform method. If only X or only y is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.

        ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

        y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X.

        verbose: int or None, default=None Verbosity level for the transformers. If None, it uses the transformer's own verbosity.

        Returnsdataframe Original feature set. Only returned if provided.

        series Original target column. Only returned if provided.

        function atom.atom.load(filename, data=None, transform_data=True, verbose=None)[source]Loads an atom instance from a pickle file.

        If the instance was saved using save_data=False, it's possible to load new data into it and apply all data transformations.

        Note

        The loaded instance's current branch is the same branch as it was when saved.

        Parametersfilename: str Name of the pickle file.

        data: sequence of indexables or None, default=None Original dataset. Only use this parameter if the loaded file was saved using save_data=False. Allowed formats are:

        X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str or sequence Target column corresponding to X.

        transform_data: bool, default=True If False, the data is left as provided. If True, it's transformed through all the steps in the loaded instance's pipeline.

        verbose: int or None, default=None Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if transform_data=False.

        Returnsatom instance Unpickled atom instance.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.

        Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the suffix parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.

        Parametersother: Runner Instance with which to merge. Should be of the same class as self.

        suffix: str, default=\"2\" Conflicting branches and models are merged adding suffix to the end of their names.

        method update_layout(**kwargs)[source]Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_layout method.

        method update_traces(**kwargs)[source]Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_traces method.

        method reset()[source]Reset the instance to its initial state.

        Deletes all branches and models. The dataset is also reset to its form after initialization.

        method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method save_data(filename=\"auto\", dataset=\"dataset\", **kwargs)[source]Save the data in the current branch to a .csv file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        dataset: str, default=\"dataset\" Data set to save.

        **kwargs Additional keyword arguments for pandas' to_csv method.

        method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Converts the columns to the smallest possible matching dtype.

        Parametersint2bool: bool, default=False Whether to convert int columns to bool type. Only if the values in the column are strictly in (0, 1) or (-1, 1).

        int2uint: bool, default=False Whether to convert int to uint (unsigned integer). Only if the values in the column are strictly positive.

        str2cat: bool, default=False Whether to convert string to category. Only if the number of categories would be less than 30% of the length of the column.

        dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column.

        columns: int, str, slice, sequence or None, default=None Names, positions or dtypes of the columns in the dataset to shrink. If None, transform all columns.

        method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.

        Warning

        Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

        Parametersmodels: slice, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch.

        name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: Stack.

        **kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the final_estimator parameter.

        method stats(_vb=-2)[source]Display basic information about the dataset.

        Parameters_vb: int, default=-2 Internal parameter to always print if called by user.

        method status()[source]Get an overview of the branches and models.

        This method prints the same information as the __repr__ and also saves it to the logger.

        method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.

        Transformers that are only applied on the training set are skipped. If only X or only y is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers.

        y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to X.

        verbose: int or None, default=None Verbosity level for the transformers. If None, it uses the transformer's own verbosity.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.

        Warning

        Combining models trained on different branches into one ensemble is not allowed and will raise an exception.

        Parametersmodels: slice, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch.

        name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: Vote.

        **kwargs Additional keyword arguments for sklearn's voting instance.

        "}, {"location": "API/ATOM/atomregressor/#data-cleaning", "title": "Data cleaning", "text": "

        The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.

        Tip

        Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.

        cleanApplies standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.

        method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Applies standard data cleaning steps on the dataset.

        Use the parameters to choose which transformations to perform. The available steps are:

        See the Cleaner class for a description of the parameters.

        method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.

        For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.

        See the Discretizer class for a description of the parameters.

        Tip

        Use the plot_distribution method to visualize a column's distribution and decide on the bins.

        method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.

        The encoding type depends on the number of classes in the column:

        Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.

        See the Encoder class for a description of the parameters.

        Note

        This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.

        Tip

        Use the categorical attribute for a list of the categorical features in the dataset.

        method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.

        Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize what are considered \"missing values\".

        See the Imputer class for a description of the parameters.

        Tip

        Use the nans attribute to check the amount of missing values per column.

        method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

        This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.

        See the Normalizer class for a description of the parameters.

        Tip

        Use the plot_distribution method to examine a column's distribution.

        method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.

        Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

        See the Pruner class for a description of the parameters.

        Note

        This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.

        Tip

        Use the outliers attribute to check the number of outliers per column.

        method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.

        Apply one of sklearn's scalers. Categorical columns are ignored.

        See the Scaler class for a description of the parameters.

        Tip

        Use the scaled attribute to check whether the dataset is scaled.

        "}, {"location": "API/ATOM/atomregressor/#nlp", "title": "NLP", "text": "

        The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called corpus. Read more in the user guide.

        textcleanApplies standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.

        method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Applies standard text cleaning to the corpus.

        Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised.

        See the TextCleaner class for a description of the parameters.

        method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.

        Convert words to a more uniform standard. The transformations are applied on the column named corpus, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.

        See the TextNormalizer class for a description of the parameters.

        method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.

        Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g. \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named corpus. If there is no column with that name, an exception is raised.

        See the Tokenizer class for a description of the parameters.

        method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.

        Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named corpus. If there is no column with that name, an exception is raised.

        If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix corpus_. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.

        See the Vectorizer class for a description of the parameters.

        "}, {"location": "API/ATOM/atomregressor/#feature-engineering", "title": "Feature engineering", "text": "

        To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.

        feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.

        method feature_extraction(features=['day', 'month', 'year'], fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.

        Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

        See the FeatureExtractor class for a description of the parameters.

        method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.

        Create new combinations of existing features to capture the non-linear relations between the original features.

        See the FeatureGenerator class for a description of the parameters.

        method feature_grouping(group, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.

        Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

        See the FeatureGrouper class for a description of the parameters.

        method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.

        Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low variance features.

        See the FeatureSelector class for a description of the parameters.

        Note

        "}, {"location": "API/ATOM/atomregressor/#training", "title": "Training", "text": "

        The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.

        runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.

        method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.

        Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.

        The following steps are applied to every model:

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the DirectClassifier or DirectRegressor class for a description of the parameters.

        method successive_halving(models, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.

        The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g. only using tree-based models.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.

        method train_sizing(models, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.

        When training models, there is usually a trade-off between model performance and computation time, which is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off, and help determine the optimal size of the training set. The models are fitted multiple times, ever-increasing the number of samples in the training set.

        The following steps are applied to every model (per iteration):

        1. Apply hyperparameter tuning (optional).
        2. Fit the model on the training set using the best combination of hyperparameters found.
        3. Evaluate the model on the test set.
        4. Train the estimator on various bootstrapped samples of the training set and evaluate again on the test set (optional).

        See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.

        "}, {"location": "API/data_cleaning/balancer/", "title": "Balancer", "text": "

        class atom.data_cleaning.Balancer(strategy=\"ADASYN\", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Balance the number of samples per class in the target column.

        When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set. Use only for classification tasks.

        This class can be accessed from atom through the balance method. Read more in the user guide.

        Warning

        Parametersstrategy: str or estimator, default=\"ADASYN\" Type of algorithm with which to balance the dataset. Choose from the name of any estimator in the imbalanced-learn package or provide a custom instance of such.

        n_jobs: int, default=1 Number of cores to use for parallel processing.

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

        **kwargs Additional keyword arguments for the strategy estimator.

        Attributes[strategy]: imblearn estimator Object (lowercase strategy) used to balance the data, e.g. balancer.adasyn for the default strategy.

        mapping: dict Target values mapped to their respective encoded integer.

        See Also

        Encoder Perform encoding of categorical features.

        Imputer Handle missing values in the data.

        Pruner Prune outliers from the data.

        "}, {"location": "API/data_cleaning/balancer/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.train)\n\natom.balance(strategy=\"smote\", verbose=2)\n\n# Note that the number of rows has increased\nprint(atom.train)\n
        from atom.data_cleaning import Balancer\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\nprint(X)\n\nbalancer = Balancer(strategy=\"smote\", verbose=2)\nX, y = balancer.transform(X, y)\n\n# Note that the number of rows has increased\nprint(X)\n
        "}, {"location": "API/data_cleaning/balancer/#methods", "title": "Methods", "text": "

        fitDoes nothing.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformBalance the data.

        method fit(X=None, y=None, **fit_params)[source]Does nothing.

        Implemented for continuity of the API.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsself Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=-1)[source]Balance the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str or sequence, default=-1 Target column corresponding to X.

        Returnsdataframe Balanced dataframe.

        series Transformed target column.

        "}, {"location": "API/data_cleaning/cleaner/", "title": "Cleaner", "text": "

        class atom.data_cleaning.Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device=\"cpu\", engine=None, verbose=0, logger=None)[source]Applies standard data cleaning steps on a dataset.

        Use the parameters to choose which transformations to perform. The available steps are:

        This class can be accessed from atom through the clean method. Read more in the user guide.

        Parametersconvert_dtypes: bool, default=True Convert the columns' data types to the best possible types that support pd.NA.

        drop_dtypes: str, sequence or None, default=None Columns with these data types are dropped from the dataset.

        drop_chars: str or None, default=None Remove the specified regex pattern from column names, e.g. [^A-Za-z0-9]+ to remove all non-alphanumerical characters.

        strip_categorical: bool, default=True Whether to strip spaces from categorical columns.

        drop_duplicates: bool, default=False Whether to drop duplicate rows. Only the first occurrence of every duplicated row is kept.

        drop_missing_target: bool, default=True Whether to drop rows with missing values in the target column. This transformation is ignored if y is not provided.

        encode_target: bool, default=True Whether to encode the target column(s). This includes converting categorical columns to numerical, and binarizing multilabel columns. This transformation is ignored if y is not provided.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        Attributesmissing: list Values that are considered \"missing\". Default values are: \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.

        mapping: dict Target values mapped to their respective encoded integer. Only available if encode_target=True.

        feature_names_in_: np.array Names of features seen during fit.

        target_names_in_: np.array Names of target columns seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        Encoder Perform encoding of categorical features.

        Discretizer Bin continuous data into intervals.

        Scaler Scale the data.

        "}, {"location": "API/data_cleaning/cleaner/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\ny = [\"a\" if i else \"b\" for i in y]\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.y)\n\natom.clean(verbose=2)\n\nprint(atom.y)\n
        from atom.data_cleaning import Cleaner\nfrom numpy.random import randint\n\ny = [\"a\" if i else \"b\" for i in range(randint(100))]\n\ncleaner = Cleaner(verbose=2)\ny = cleaner.fit_transform(y=y)\n\nprint(y)\n
        "}, {"location": "API/data_cleaning/cleaner/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformInversely transform the label encoding.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformApply the data cleaning steps to the data.

        method fit(X=None, y=None)[source]Fit to data.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to X.

        ReturnsCleaner Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Inversely transform the label encoding.

        This method only inversely transforms the target encoding. The rest of the transformations can't be inverted. If encode_target=False, the data is returned as is.

        ParametersX: dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Unchanged feature set. Only returned if provided.

        series Original target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X=None, y=None)[source]Apply the data cleaning steps to the data.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        "}, {"location": "API/data_cleaning/discretizer/", "title": "Discretizer", "text": "

        class atom.data_cleaning.Discretizer(strategy=\"quantile\", bins=5, labels=None, device=\"cpu\", engine=None, verbose=0, logger=None, random_state=None)[source]Bin continuous data into intervals.

        For each feature, the bin edges are computed during fit and, together with the number of bins, they define the intervals. Ignores categorical columns.

        This class can be accessed from atom through the discretize method. Read more in the user guide.

        Tip

        The transformation returns categorical columns. Use the Encoder class to convert them back to numerical types.

        Parametersstrategy: str, default=\"quantile\" Strategy used to define the widths of the bins. Choose from:

        bins: int, sequence or dict, default=5 Bin number or bin edges in which to split every column.

        labels: sequence, dict or None, default=None Label names with which to replace the binned intervals.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random. Only for strategy=\"quantile\".

        Attributesfeature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        Encoder Perform encoding of categorical features.

        Imputer Handle missing values in the data.

        Normalizer Transform the data to follow a Normal/Gaussian distribution.

        "}, {"location": "API/data_cleaning/discretizer/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom[\"mean radius\"])\n\natom.discretize(\n    strategy=\"custom\",\n    bins=[13, 18],\n    labels=[\"small\", \"medium\", \"large\"],\n    verbose=2,\n    columns=\"mean radius\",\n)\n\nprint(atom[\"mean radius\"])\n
        from atom.data_cleaning import Discretizer\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\nprint(X[\"mean radius\"])\n\ndisc = Discretizer(\n    strategy=\"custom\",\n    bins=[13, 18],\n    labels=[\"small\", \"medium\", \"large\"],\n    verbose=2,\n)\nX[\"mean radius\"] = disc.fit_transform(X[[\"mean radius\"]])[\"mean radius\"]\n\nprint(X[\"mean radius\"])\n
        "}, {"location": "API/data_cleaning/discretizer/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformBin the data into intervals.

        method fit(X, y=None)[source]Fit to data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        ReturnsDiscretizer Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Bin the data into intervals.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Transformed feature set.

        "}, {"location": "API/data_cleaning/encoder/", "title": "Encoder", "text": "

        class atom.data_cleaning.Encoder(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"infrequent\", verbose=0, logger=None, **kwargs)[source]Perform encoding of categorical features.

        The encoding type depends on the number of classes in the column:

        Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Infrequent classes can be replaced with a value to prevent excessively high cardinality.

        This class can be accessed from atom through the encode method. Read more in the user guide.

        Warning

        Three category-encoders estimators are unavailable:

        Parametersstrategy: str or estimator, default=\"Target\" Type of encoding to use for high cardinality features. Choose from any of the estimators in the category-encoders package or provide a custom one.

        max_onehot: int or None, default=10 Maximum number of unique values in a feature to perform one-hot encoding. If None, strategy-encoding is always used for columns with more than two classes.

        ordinal: dict or None, default=None Order of ordinal features, where the dict key is the feature's name and the value is the class order, e.g. {\"salary\": [\"low\", \"medium\", \"high\"]}.

        infrequent_to_value: int, float or None, default=None Replaces infrequent class occurrences in categorical columns with the string in parameter value. This transformation is done before the encoding of the column.

        value: str, default=\"infrequent\" Value with which to replace rare classes. This parameter is ignored if infrequent_to_value=None.

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        **kwargs Additional keyword arguments for the strategy estimator.

        Attributesmapping: dict of dicts Encoded values and their respective mapping. The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, etc.).

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        Cleaner Applies standard data cleaning steps on a dataset.

        Imputer Handle missing values in the data.

        Pruner Prune outliers from the data.

        "}, {"location": "API/data_cleaning/encoder/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\nfrom numpy.random import randint\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\nX[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\nX[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\nX[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.X)\n\natom.encode(strategy=\"target\", max_onehot=10, verbose=2)\n\n# Note the one-hot encoded column with name [feature]_[class]\nprint(atom.X)\n
        from atom.data_cleaning import Encoder\nfrom sklearn.datasets import load_breast_cancer\nfrom numpy.random import randint\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\nX[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\nX[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\nX[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\nprint(X)\n\nencoder = Encoder(strategy=\"target\", max_onehot=10, verbose=2)\nX = encoder.fit_transform(X, y)\n\n# Note the one-hot encoded column with name [feature]_[class]\nprint(X)\n
        "}, {"location": "API/data_cleaning/encoder/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformEncode the data.

        method fit(X, y=None)[source]Fit to data.

        Note that leaving y=None can lead to errors if the strategy encoder requires target values. For multioutput tasks, only the first target column is used to fit the encoder.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, dict, sequence or dataframe-like Target column corresponding to X.

        ReturnsEncoder Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Encode the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Encoded dataframe.

        "}, {"location": "API/data_cleaning/imputer/", "title": "Imputer", "text": "

        class atom.data_cleaning.Imputer(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, device=\"cpu\", engine=None, verbose=0, logger=None)[source]Handle missing values in the data.

        Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the missing attribute to customize which values are considered \"missing values\".

        This class can be accessed from atom through the impute method. Read more in the user guide.

        Parametersstrat_num: str, int or float, default=\"drop\" Imputing strategy for numerical columns. Choose from:

        strat_cat: str, default=\"drop\" Imputing strategy for categorical columns. Choose from:

        max_nan_rows: int, float or None, default=None Maximum number or fraction of missing values in a row (if more, the row is removed). If None, ignore this step.

        max_nan_cols: int, float or None, default=None Maximum number or fraction of missing values in a column (if more, the column is removed). If None, ignore this step.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        Attributesmissing: list Values that are considered \"missing\". Default values are: \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, +inf and -inf are always considered missing since they are incompatible with sklearn estimators.

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        Balancer Balance the number of samples per class in the target column.

        Discretizer Bin continuous data into intervals.

        Encoder Perform encoding of categorical features.

        "}, {"location": "API/data_cleaning/imputer/#example", "title": "Example", "text": "atomstand-alone
        import numpy as np\nfrom atom import ATOMClassifier\nfrom numpy.random import randint\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n# Add some random missing values to the data\nfor i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n    X.iat[i, j] = np.NaN\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.nans)\n\natom.impute(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n\nprint(atom.n_nans)\n
        import numpy as np\nfrom atom.data_cleaning import Imputer\nfrom numpy.random import randint\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n# Add some random missing values to the data\nfor i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n    X.iloc[i, j] = np.nan\n\nimputer = Imputer(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\nX, y = imputer.fit_transform(X, y)\n\nprint(X)\n
        "}, {"location": "API/data_cleaning/imputer/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformImpute the missing values.

        method fit(X, y=None)[source]Fit to data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        ReturnsImputer Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Impute the missing values.

        Note that leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Imputed dataframe.

        series Transformed target column. Only returned if provided.

        "}, {"location": "API/data_cleaning/normalizer/", "title": "Normalizer", "text": "

        class atom.data_cleaning.Normalizer(strategy=\"yeojohnson\", device=\"cpu\", engine=None, verbose=0, logger=None, random_state=None, **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.

        This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Categorical columns are ignored.

        This class can be accessed from atom through the normalize method. Read more in the user guide.

        Warning

        The quantile strategy performs a non-linear transformation. This may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.

        Note

        The yeojohnson and boxcox strategies scale the data after transforming. Use the kwargs to change this behaviour.

        Parametersstrategy: str, default=\"yeojohnson\" The transforming strategy. Choose from:

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        random_state: int or None, default=None Seed used by the quantile strategy. If None, the random number generator is the RandomState used by np.random.

        **kwargs Additional keyword arguments for the strategy estimator.

        Attributes[strategy]: sklearn transformer Object with which the data is transformed.

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        Cleaner Applies standard data cleaning steps on a dataset.

        Pruner Prune outliers from the data.

        Scaler Scale the data.

        "}, {"location": "API/data_cleaning/normalizer/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.dataset)\n\natom.plot_distribution(columns=0)\n\natom.normalize(verbose=2)\n\nprint(atom.dataset)\n\natom.plot_distribution(columns=0)\n
        from atom.data_cleaning import Normalizer\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\nnormalizer = Normalizer(verbose=2)\nX = normalizer.fit_transform(X)\n\nprint(X)\n
        "}, {"location": "API/data_cleaning/normalizer/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.

        method fit(X, y=None)[source]Fit to data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        ReturnsNormalizer Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Original dataframe.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Apply the transformations to the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Normalized dataframe.

        "}, {"location": "API/data_cleaning/pruner/", "title": "Pruner", "text": "

        class atom.data_cleaning.Pruner(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, device=\"cpu\", engine=None, verbose=0, logger=None, **kwargs)[source]Prune outliers from the data.

        Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.

        This class can be accessed from atom through the prune method. Read more in the user guide.

        Info

        The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"dbscan\".

        Parametersstrategy: str or sequence, default=\"zscore\" Strategy with which to select the outliers. If sequence of strategies, only samples marked as outliers by all chosen strategies are dropped. Choose from:

        method: int, float or str, default=\"drop\" Method to apply on the outliers. Only the zscore strategy accepts another method than \"drop\". Choose from:

        max_sigma: int or float, default=3 Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. Only if strategy=\"zscore\".

        include_target: bool, default=False Whether to include the target column in the search for outliers. This can be useful for regression tasks. Only if strategy=\"zscore\".

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        **kwargs Additional keyword arguments for the strategy estimator. If sequence of strategies, the params should be provided in a dict with the strategy's name as key.

        Attributes[strategy]: sklearn estimator Object used to prune the data, e.g. pruner.iforest for the isolation forest strategy.

        See Also

        Balancer Balance the number of samples per class in the target column.

        Normalizer Transform the data to follow a Normal/Gaussian distribution.

        Scaler Scale the data.

        "}, {"location": "API/data_cleaning/pruner/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.dataset)\n\natom.prune(strategy=\"iforest\", verbose=2)\n\n# Note the reduced number of rows\nprint(atom.dataset)\n\natom.plot_distribution(columns=0)\n
        from atom.data_cleaning import Pruner\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\npruner = Pruner(strategy=\"iforest\", verbose=2)\nX = pruner.fit_transform(X)\n\n# Note the reduced number of rows\nprint(X)\n
        "}, {"location": "API/data_cleaning/pruner/#methods", "title": "Methods", "text": "

        fitDoes nothing.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformApply the outlier strategy on the data.

        method fit(X=None, y=None, **fit_params)[source]Does nothing.

        Implemented for continuity of the API.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsself Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Apply the outlier strategy on the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set.

        series Transformed target column. Only returned if provided.

        "}, {"location": "API/data_cleaning/scaler/", "title": "Scaler", "text": "

        class atom.data_cleaning.Scaler(strategy=\"standard\", include_binary=False, device=\"cpu\", engine=None, verbose=0, logger=None, **kwargs)[source]Scale the data.

        Apply one of sklearn's scalers. Categorical columns are ignored.

        This class can be accessed from atom through the scale method. Read more in the user guide.

        Parametersstrategy: str, default=\"standard\" Strategy with which to scale the data. Choose from:

        include_binary: bool, default=False Whether to scale binary columns (only 0s and 1s).

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        **kwargs Additional keyword arguments for the strategy estimator.

        Attributes[strategy]: sklearn transformer Object with which the data is scaled.

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        Balancer Balance the number of samples per class in the target column.

        Normalizer Transform the data to follow a Normal/Gaussian distribution.

        Scaler Scale the data.

        "}, {"location": "API/data_cleaning/scaler/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, random_state=1)\nprint(atom.dataset)\n\natom.scale(verbose=2)\n\n# Note the scaled feature values\nprint(atom.dataset)\n
        from atom.data_cleaning import Scaler\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\nscaler = Scaler(verbose=2)\nX = scaler.fit_transform(X)\n\n# Note the scaled feature values\nprint(X)\n
        "}, {"location": "API/data_cleaning/scaler/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformPerform standardization by centering and scaling.

        method fit(X, y=None)[source]Fit to data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        ReturnsScaler Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Original dataframe.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Perform standardization by centering and scaling.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Scaled dataframe.

        "}, {"location": "API/feature_engineering/featureextractor/", "title": "FeatureExtractor", "text": "

        class atom.feature_engineering.FeatureExtractor(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, verbose=0, logger=None)[source]Extract features from datetime columns.

        Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype datetime64 are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.

        This class can be accessed from atom through the feature_extraction method. Read more in the user guide.

        Warning

        Decision tree-based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos features should be considered one single coordinate system.

        Parametersfeatures: str or sequence, default=(\"day\", \"month\", \"year\") Features to create from the datetime columns. Note that created features with zero variance (e.g. the feature hour in a column that only contains dates) are ignored. Allowed values are datetime attributes from pandas.Series.dt.

        fmt: str, sequence or None, default=None Format (strptime) of the categorical columns that need to be converted to datetime. If sequence, the n-th format corresponds to the n-th categorical column that can be successfully converted. If None, the format is inferred automatically from the first non NaN value. Values that can not be converted are returned as NaT.

        encoding_type: str, default=\"ordinal\" Type of encoding to use. Choose from:

        drop_columns: bool, default=True Whether to drop the original columns after transformation.

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        Attributesfeature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        FeatureGenerator Generate new features.

        FeatureGrouper Extract statistics from similar features.

        FeatureSelector Reduce the number of features in the data.

        "}, {"location": "API/feature_engineering/featureextractor/#example", "title": "Example", "text": "atomstand-alone
        import pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n# Add a datetime column\nX[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\natom = ATOMClassifier(X, y)\natom.feature_extraction(features=[\"day\"], fmt=\"%d/%m/%Y\", verbose=2)\n\n# Note the date_day column\nprint(atom.dataset)\n
        import pandas as pd\nfrom atom.feature_engineering import FeatureExtractor\nfrom sklearn.datasets import load_breast_cancer\n\nX, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n# Add a datetime column\nX[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\nfe = FeatureExtractor(features=[\"day\"], fmt=\"%Y-%m-%d\", verbose=2)\nX = fe.transform(X)\n\n# Note the date_day column\nprint(X)\n
        "}, {"location": "API/feature_engineering/featureextractor/#methods", "title": "Methods", "text": "

        fitDoes nothing.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformExtract the new features.

        method fit(X=None, y=None, **fit_params)[source]Does nothing.

        Implemented for continuity of the API.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsself Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Extract the new features.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Transformed feature set.

        "}, {"location": "API/feature_engineering/featuregenerator/", "title": "FeatureGenerator", "text": "

        class atom.feature_engineering.FeatureGenerator(strategy=\"dfs\", n_features=None, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Generate new features.

        Create new combinations of existing features to capture the non-linear relations between the original features.

        This class can be accessed from atom through the feature_generation method. Read more in the user guide.

        Warning

        Tip

        dfs can create many new features and not all of them will be useful. Use the FeatureSelector class to reduce the number of features.

        Parametersstrategy: str, default=\"dfs\" Strategy to create new features. Choose from:

        n_features: int or None, default=None Maximum number of newly generated features to add to the dataset. If None, select all created features.

        operators: str, sequence or None, default=None Mathematical operators to apply on the features. None to use all. Choose from: add, sub, mul, div, abs, sqrt, log, inv, sin, cos, tan.

        n_jobs: int, default=1 Number of cores to use for parallel processing.

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

        **kwargs Additional keyword arguments for the SymbolicTransformer instance. Only for the gfg strategy.

        Attributesgfg: SymbolicTransformer Object used to calculate the genetic features. Only for the gfg strategy.

        genetic_features: pd.DataFrame Information on the newly created non-linear features. Only for the gfg strategy. Columns include:

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        FeatureExtractor Extract features from datetime columns.

        FeatureGrouper Extract statistics from similar features.

        FeatureSelector Reduce the number of features in the data.

        "}, {"location": "API/feature_engineering/featuregenerator/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y)\natom.feature_generation(strategy=\"dfs\", n_features=5, verbose=2)\n\n# Note the texture error / worst symmetry column\nprint(atom.dataset)\n
        from atom.feature_engineering import FeatureGenerator\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\nfg = FeatureGenerator(strategy=\"dfs\", n_features=5, verbose=2)\nX = fg.fit_transform(X, y)\n\n# Note the radius error * worst smoothness column\nprint(X)\n
        "}, {"location": "API/feature_engineering/featuregenerator/#methods", "title": "Methods", "text": "

        fitFit to data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformGenerate new features.

        method fit(X, y=None)[source]Fit to data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsself Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Generate new features.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Transformed feature set.

        "}, {"location": "API/feature_engineering/featuregrouper/", "title": "FeatureGrouper", "text": "

        class atom.feature_engineering.FeatureGrouper(group, operators=None, drop_columns=True, verbose=0, logger=None)[source]Extract statistics from similar features.

        Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the groups method.

        This class can be accessed from atom through the feature_grouping method. Read more in the user guide.

        Tip

        Use a regex pattern with the group parameter to select groups more easily, e.g. atom.feature_grouping({\"group1\": \"var_.+\"}) to select all features that start with var_.

        Parametersgroup: dict Group names and features. Select the features by name, position or regex pattern. A feature can belong to multiple groups.

        operators: str, sequence or None, default=None Statistical operators to apply on the groups. Any operator from numpy or scipy.stats (checked in that order) that is applied on an array can be used. If None, it uses: min, max, mean, median, mode and std.

        drop_columns: bool, default=True Whether to drop the columns in groups after transformation.

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        Attributesgroups: dict Names and features of every created group.

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        FeatureExtractor Extract features from datetime columns.

        FeatureGenerator Generate new features.

        FeatureSelector Reduce the number of features in the data.

        "}, {"location": "API/feature_engineering/featuregrouper/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y)\natom.feature_grouping({\"means\": [\"mean.+\"]}, verbose=2)\n\n# Note the mean features are gone and the new std(means) feature\nprint(atom.dataset)\n
        from atom.feature_engineering import FeatureGrouper\nfrom sklearn.datasets import load_breast_cancer\n\nX, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n# Group all features that start with mean\nfg = FeatureGrouper({\"means\": [\"mean.+\"]}, verbose=2)\nX = fg.transform(X)\n\n# Note the mean features are gone and the new std(means) feature\nprint(X)\n
        "}, {"location": "API/feature_engineering/featuregrouper/#methods", "title": "Methods", "text": "

        fitDoes nothing.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformGroup features.

        method fit(X=None, y=None, **fit_params)[source]Does nothing.

        Implemented for continuity of the API.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsself Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Group features.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Transformed feature set.

        "}, {"location": "API/feature_engineering/featureselector/", "title": "FeatureSelector", "text": "

        class atom.feature_engineering.FeatureSelector(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, n_jobs=1, device=\"cpu\", engine=None, backend=\"loky\", verbose=0, logger=None, random_state=None, **kwargs)[source]Reduce the number of features in the data.

        Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low variance features.

        This class can be accessed from atom through the feature_selection method. Read more in the user guide.

        Warning

        Info

        Tip

        Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead.

        Parametersstrategy: str or None, default=None Feature selection strategy to use. Choose from:

        solver: str, estimator or None, default=None Solver/estimator to use for the feature selection strategy. See the corresponding documentation for an extended description of the choices. If None, the default value is used (only if strategy=\"pca\"). Choose from:

        n_features: int, float or None, default=None Number of features to select.

        If strategy=\"sfm\" and the threshold parameter is not specified, the threshold is automatically set to -inf to select n_features number of features.

        If strategy=\"rfecv\", n_features is the minimum number of features to select.

        This parameter is ignored if any of the following strategies is selected: pso, hho, gwo, dfo, go.

        min_repeated: int, float or None, default=2 Remove categorical features if there isn't any repeated value in at least min_repeated rows. The default is to keep all features with non-maximum variance, i.e. remove the features whose number of unique values is equal to the number of rows (usually the case for names, IDs, etc...).

        max_repeated: int, float or None, default=1.0 Remove categorical features with the same value in at least max_repeated rows. The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples.

        max_correlation: float or None, default=1.0 Minimum absolute Pearson correlation to identify correlated features. For each group, it removes all except the feature with the highest correlation to y (if provided, else it removes all but the first). The default value removes equal columns. If None, skip this step.

        n_jobs: int, default=1 Number of cores to use for parallel processing.

        device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. device=\"gpu\" to use the GPU. Read more in the user guide.

        engine: dict or None, default=None Execution engine to use for data and estimators. The value should be a dictionary with keys data and/or estimator, with their corresponding choice as values. If None, the default options are selected. Choose from:

        backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from:

        verbose: int, default=0 Verbosity level of the class. Choose from:

        logger: str, Logger or None, default=None

        random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the RandomState used by np.random.

        **kwargs Any extra keyword argument for the strategy estimator. See the corresponding documentation for the available options.

        Attributescollinear: pd.DataFrame Information on the removed collinear features. Columns include:

        [strategy]: sklearn transformer Object used to transform the data, e.g. fs.pca for the pca strategy.

        feature_names_in_: np.array Names of features seen during fit.

        n_features_in_: int Number of features seen during fit.

        See Also

        FeatureExtractor Extract features from datetime columns.

        FeatureGenerator Generate new features.

        FeatureGrouper Extract statistics from similar features.

        "}, {"location": "API/feature_engineering/featureselector/#example", "title": "Example", "text": "atomstand-alone
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y)\natom.feature_selection(strategy=\"pca\", n_features=12, verbose=2)\n\n# Note that the column names changed\nprint(atom.dataset)\n\natom.plot_pca()\n
        from atom.feature_engineering import FeatureSelector\nfrom sklearn.datasets import load_breast_cancer\n\nX, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\nfs = FeatureSelector(strategy=\"pca\", n_features=12, verbose=2)\nX = fs.fit_transform(X)\n\n# Note that the column names changed\nprint(X)\n
        "}, {"location": "API/feature_engineering/featureselector/#methods", "title": "Methods", "text": "

        fitFit the feature selector to the data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDoes nothing.logPrint message and save to log file.plot_componentsPlot the explained variance ratio per component.plot_pcaPlot the explained variance ratio vs number of components.plot_rfecvPlot the rfecv results.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.transformTransform the data.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.

        method fit(X, y=None)[source]Fit the feature selector to the data.

        The univariate, sfm (when model is not fitted), sfs, rfe and rfecv strategies need a target column. Leaving it None raises an exception.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsself Estimator instance.

        method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        **fit_params Additional keyword arguments for the fit method.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method get_metadata_routing()[source]Get metadata routing of this object.

        Returnsrouting : MetadataRequest A :class:~utils.metadata_routing.MetadataRequest encapsulating routing information.

        method get_params(deep=True)[source]Get parameters for this estimator.

        Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators.

        Returnsparams : dict Parameter names mapped to their values.

        method inverse_transform(X=None, y=None)[source]Does nothing.

        ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored.

        y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to X.

        Returnsdataframe Transformed feature set. Only returned if provided.

        series Transformed target column. Only returned if provided.

        method log(msg, level=0, severity=\"info\")[source]Print message and save to log file.

        Parametersmsg: int, float or str Message to save to the logger and print to stdout.

        level: int, default=0 Minimum verbosity level to print the message.

        severity: str, default=\"info\" Severity level of the message. Choose from: debug, info, warning, error, critical.

        method plot_components(show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the explained variance ratio per component.

        Kept components are colored and discarded components are transparent. This plot is available only when feature selection was applied with strategy=\"pca\".

        Parametersshow: int or None, default=None Number of components to show. None to show all.

        title: str, dict or None, default=None Title for the plot.

        legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices.

        figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of components shown.

        filename: str or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

        Returnsgo.Figure or None Plot object. Only returned if display=None.

        method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the explained variance ratio vs number of components.

        If the underlying estimator is PCA (for dense datasets), all possible components are plotted. If the underlying estimator is TruncatedSVD (for sparse datasets), it only shows the selected components. The star marks the number of components selected by the user. This plot is available only when feature selection was applied with strategy=\"pca\".

        Parameterstitle: str, dict or None, default=None Title for the plot.

        legend: str, dict or None, default=None Does nothing. Implemented for continuity of the API.

        figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

        filename: str or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

        Returnsgo.Figure or None Plot object. Only returned if display=None.

        method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the rfecv results.

        Plot the scores obtained by the estimator fitted on every subset of the dataset. Only available when feature selection was applied with strategy=\"rfecv\".

        Parameterstitle: str, dict or None, default=None Title for the plot.

        legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices.

        figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y).

        filename: str or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If filename has no file type, the plot is saved as html. If None, the plot is not saved.

        display: bool or None, default=True Whether to render the plot. If None, it returns the figure.

        Returnsgo.Figure or None Plot object. Only returned if display=None.

        method reset_aesthetics()[source]Reset the plot aesthetics to their default values.

        method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.

        Parametersfilename: str, default=\"auto\" Name of the file. Use \"auto\" for automatic naming.

        save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method.

        method set_params(**params)[source]Set the parameters of this estimator.

        Parameters**params : dict Estimator parameters.

        Returnsself : estimator instance Estimator instance.

        method transform(X, y=None)[source]Transform the data.

        ParametersX: dataframe-like Feature set with shape=(n_samples, n_features).

        y: int, str, sequence, dataframe-like or None, default=None Does nothing. Implemented for continuity of the API.

        Returnsdataframe Transformed feature set.

        method update_layout(**kwargs)[source]Update the properties of the plot's layout.

        Recursively update the structure of the original layout with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_layout method.

        method update_traces(**kwargs)[source]Update the properties of the plot's traces.

        Recursively update the structure of the original traces with the values in the arguments.

        Parameters**kwargs Keyword arguments for the figure's update_traces method.

        "}, {"location": "API/models/adab/", "title": "AdaBoost", "text": "

        AdaBaccept sparse

        AdaBoost is a meta-estimator that begins by fitting a classifier/regressor on the original dataset and then fits additional copies of the algorithm on the same dataset but where the weights of instances are adjusted according to the error of the current prediction.

        Corresponding estimators are:

        Read more in sklearn's documentation.

        See Also

        GradientBoostingMachine Gradient Boosting Machine.

        RandomForest Random Forest.

        XGBoost Extreme Gradient Boosting.

        "}, {"location": "API/models/adab/#example", "title": "Example", "text": "
        from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, random_state=1)\natom.run(models=\"AdaB\", metric=\"f1\", verbose=2)\n
        "}, {"location": "API/models/adab/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression

        Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)algorithmCategoricalDistribution(choices=('SAMME.R', 'SAMME'))

        Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)lossCategoricalDistribution(choices=('linear', 'square', 'exponential'))

        "}, {"location": "API/models/adab/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/adab/#data-attributes", "title": "Data attributes", "text": "

        Attributespipeline: pd.SeriesTransformers fitted on the data.

        Models that used automated feature scaling have the scaler added. Use this attribute only to access the individual instances. To visualize the pipeline, use the plot_pipeline method.

        mapping: dictEncoded values and their respective mapped values.

        The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g. Ordinal, Leave-one-out, etc...).dataset: dataframeComplete data set.train: dataframeTraining set.test: dataframeTest set.X: dataframeFeature set.y: series | dataframeTarget column(s).X_train: dataframeFeatures of the training set.y_train: series | dataframeTarget column(s) of the training set.X_test: dataframeFeatures of the test set.y_test: series | dataframeTarget column(s) of the test set.shape: tuple[int, int]Shape of the dataset (n_rows, n_columns).columns: seriesName of all the columns.n_columns: intNumber of columns.features: seriesName of the features.n_features: intNumber of features.target: str | list[str]Name of the target column(s).

        "}, {"location": "API/models/adab/#utility-attributes", "title": "Utility attributes", "text": "

        Attributesname: strName of the model.

        Use the property's @setter to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes.study: Study | NoneOptuna study used for hyperparameter tuning.trials: pd.DataFrame | NoneOverview of the trials' results.

        All durations are in seconds. Columns include: